import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

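# Example: expanding a prefixed path into ElementTree's Clark ({uri}tag)
# notation (the namespace URI here is purely illustrative):
#   xpath_with_ns('ns:media/ns:url', {'ns': 'http://example.com'})
#       == '{http://example.com}media/{http://example.com}url'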

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)

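# Example: the class regex matches a whole-word class name anywhere in the
# class attribute, so both of these hold:
#   get_element_by_class('foo', '<span class="foo bar">nice</span>') == 'nice'
#   get_element_html_by_class('foo', '<span class="foo bar">nice</span>')
#       == '<span class="foo bar">nice</span>'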

def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs

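# Example: valueless and empty attributes are distinguished:
#   extract_attributes('<e x="y" empty="" noval>')
#       == {'x': 'y', 'empty': '', 'noval': None}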

def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

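# Example: tags are stripped, <br> becomes a newline, and surrounding
# whitespace is trimmed:
#   clean_html('  <p>foo<br/>bar</p> ') == 'foo\nbar'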

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)

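# Example: with ignore_extra=True, anything after the first JSON value is
# discarded instead of raising (input here is illustrative):
#   json.loads('{"a": 1}garbage', cls=LenientJSONDecoder, ignore_extra=True)
#       == {'a': 1}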

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream. Eg, when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

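# Example (the classic RFC 2822 sample date):
#   timeconvert('Sun, 06 Nov 1994 08:49:37 GMT') == 784111777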

def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

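# Example: unsafe characters are replaced with substitute characters, which
# then collapse into plain '_':
#   sanitize_filename('abc/de') == 'abc_de'
#   sanitize_filename('abc de', restricted=True) == 'abc_de'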

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

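# Example: protocol-relative URLs get a scheme and known typos are repaired:
#   sanitize_url('//example.com/x') == 'http://example.com/x'
#   sanitize_url('httpss://example.com') == 'https://example.com'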

def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

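# Example (hypothetical credentials): userinfo is moved out of the URL and
# into a Basic auth header value:
#   extract_basic_auth('http://user:pass@example.com/x')
#       == ('http://example.com/x', 'Basic dXNlcjpwYXNz')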

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

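# Example: duplicates are dropped while first-seen order is preserved:
#   orderedSet([1, 1, 2, 3, 2]) == [1, 2, 3]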

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )

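# Example: named, hex and decimal entities are all decoded:
#   unescapeHTML('&amp;&#x41;&#65;') == '&AA'
#   escapeHTML('<a>') == '&lt;a&gt;'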

def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode

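# Example (illustrative; assumes an `ffmpeg` binary is on PATH):
#   stdout, stderr, returncode = Popen.run(
#       ['ffmpeg', '-version'], text=True, stdout=subprocess.PIPE)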

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)

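# Example:
#   timetuple_from_msec(61123)
#       == Time(hours=0, minutes=1, seconds=1, milliseconds=123)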

def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

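# Example:
#   formatSeconds(3661) == '1:01:01'
#   formatSeconds(5.5, msec=True) == '5.500'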

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                         # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                       # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|              # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))  # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                        # optional space
                (?P<sign>\+|-)                              # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})  # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str

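# Example: the offset is returned as a timedelta and stripped from the string:
#   extract_timezone('2020-01-01T12:00:00+05:30')
#       == (datetime.timedelta(hours=5, minutes=30), '2020-01-01T12:00:00')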
1693
1694 def parse_iso8601(date_str, delimiter='T', timezone=None):
1695 """ Return a UNIX timestamp from the given date """
1696
1697 if date_str is None:
1698 return None
1699
1700 date_str = re.sub(r'\.[0-9]+', '', date_str)
1701
1702 if timezone is None:
1703 timezone, date_str = extract_timezone(date_str)
1704
1705 with contextlib.suppress(ValueError):
1706 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1707 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1708 return calendar.timegm(dt.timetuple())
1709
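# Usage sketch (hypothetical input; fractional seconds are stripped before parsing):
# >>> parse_iso8601('2014-12-08T21:55:00+01:00')
# 1418072100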
1710
1711 def date_formats(day_first=True):
1712 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1713
1714
1715 def unified_strdate(date_str, day_first=True):
1716 """Return a string with the date in the format YYYYMMDD"""
1717
1718 if date_str is None:
1719 return None
1720 upload_date = None
1721 # Replace commas
1722 date_str = date_str.replace(',', ' ')
1723 # Remove AM/PM + timezone
1724 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1725 _, date_str = extract_timezone(date_str)
1726
1727 for expression in date_formats(day_first):
1728 with contextlib.suppress(ValueError):
1729 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1730 if upload_date is None:
1731 timetuple = email.utils.parsedate_tz(date_str)
1732 if timetuple:
1733 with contextlib.suppress(ValueError):
1734 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1735 if upload_date is not None:
1736 return str(upload_date)
1737
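# Usage sketch (illustrative; the accepted patterns come from the
# DATE_FORMATS* tables defined earlier in this module):
# >>> unified_strdate('December 8, 2014')
# '20141208'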
1738
1739 def unified_timestamp(date_str, day_first=True):
1740 if date_str is None:
1741 return None
1742
1743 date_str = re.sub(r'[,|]', '', date_str)
1744
1745 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1746 timezone, date_str = extract_timezone(date_str)
1747
1748 # Remove AM/PM + timezone
1749 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1750
1751 # Remove unrecognized timezones from ISO 8601 alike timestamps
1752 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1753 if m:
1754 date_str = date_str[:-len(m.group('tz'))]
1755
1756 # Python only supports microseconds, so remove nanoseconds
1757 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1758 if m:
1759 date_str = m.group(1)
1760
1761 for expression in date_formats(day_first):
1762 with contextlib.suppress(ValueError):
1763 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1764 return calendar.timegm(dt.timetuple())
1765 timetuple = email.utils.parsedate_tz(date_str)
1766 if timetuple:
1767 return calendar.timegm(timetuple) + pm_delta * 3600
1768
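# Usage sketch (illustrative; a timestamp without timezone is treated as UTC):
# >>> unified_timestamp('2014-12-08 21:55:00')
# 1418075700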
1769
1770 def determine_ext(url, default_ext='unknown_video'):
1771 if url is None or '.' not in url:
1772 return default_ext
1773 guess = url.partition('?')[0].rpartition('.')[2]
1774 if re.match(r'^[A-Za-z0-9]+$', guess):
1775 return guess
1776 # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1777 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1778 return guess.rstrip('/')
1779 else:
1780 return default_ext
1781
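# Usage sketch (hypothetical URLs; KNOWN_EXTENSIONS is defined elsewhere in this module):
# >>> determine_ext('https://example.com/video.mp4?dl=1')
# 'mp4'
# >>> determine_ext('http://example.com/foo/bar.mp4/?download')
# 'mp4'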
1782
1783 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1784 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1785
1786
1787 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1788 R"""
1789 Return a datetime object from a string.
1790 Supported format:
1791 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1792
1793 @param format strftime format of DATE
1794 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1795 auto: round to the unit provided in date_str (if applicable).
1796 """
1797 auto_precision = False
1798 if precision == 'auto':
1799 auto_precision = True
1800 precision = 'microsecond'
1801 today = datetime_round(datetime.datetime.utcnow(), precision)
1802 if date_str in ('now', 'today'):
1803 return today
1804 if date_str == 'yesterday':
1805 return today - datetime.timedelta(days=1)
1806 match = re.match(
1807 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1808 date_str)
1809 if match is not None:
1810 start_time = datetime_from_str(match.group('start'), precision, format)
1811 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1812 unit = match.group('unit')
1813 if unit == 'month' or unit == 'year':
1814 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1815 unit = 'day'
1816 else:
1817 if unit == 'week':
1818 unit = 'day'
1819 time *= 7
1820 delta = datetime.timedelta(**{unit + 's': time})
1821 new_date = start_time + delta
1822 if auto_precision:
1823 return datetime_round(new_date, unit)
1824 return new_date
1825
1826 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1827
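# Usage sketch (editor's illustration):
# >>> datetime_from_str('now-1week')  # one week ago; 'auto' precision rounds to days here
# >>> datetime_from_str('20200229', precision='day')
# datetime.datetime(2020, 2, 29, 0, 0)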
1828
1829 def date_from_str(date_str, format='%Y%m%d', strict=False):
1830 R"""
1831 Return a date object from a string using datetime_from_str
1832
1833 @param strict Restrict allowed patterns to "YYYYMMDD" and
1834 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1835 """
1836 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1837 raise ValueError(f'Invalid date format "{date_str}"')
1838 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1839
1840
1841 def datetime_add_months(dt, months):
1842 """Increment/Decrement a datetime object by months."""
1843 month = dt.month + months - 1
1844 year = dt.year + month // 12
1845 month = month % 12 + 1
1846 day = min(dt.day, calendar.monthrange(year, month)[1])
1847 return dt.replace(year, month, day)
1848
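# Usage sketch: the day is clamped to the length of the target month, e.g.
# >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
# datetime.datetime(2020, 2, 29, 0, 0)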
1849
1850 def datetime_round(dt, precision='day'):
1851 """
1852 Round a datetime object's time to a specific precision
1853 """
1854 if precision == 'microsecond':
1855 return dt
1856
1857 unit_seconds = {
1858 'day': 86400,
1859 'hour': 3600,
1860 'minute': 60,
1861 'second': 1,
1862 }
1863 roundto = lambda x, n: ((x + n / 2) // n) * n
1864 timestamp = calendar.timegm(dt.timetuple())
1865 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1866
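# Usage sketch: times at or past the halfway point round up, e.g.
# >>> datetime_round(datetime.datetime(2020, 1, 1, 13, 5), 'day')
# datetime.datetime(2020, 1, 2, 0, 0)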
1867
1868 def hyphenate_date(date_str):
1869 """
1870 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1871 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1872 if match is not None:
1873 return '-'.join(match.groups())
1874 else:
1875 return date_str
1876
1877
1878 class DateRange:
1879 """Represents a time interval between two dates"""
1880
1881 def __init__(self, start=None, end=None):
1882 """start and end must be strings in the format accepted by date"""
1883 if start is not None:
1884 self.start = date_from_str(start, strict=True)
1885 else:
1886 self.start = datetime.datetime.min.date()
1887 if end is not None:
1888 self.end = date_from_str(end, strict=True)
1889 else:
1890 self.end = datetime.datetime.max.date()
1891 if self.start > self.end:
1892 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1893
1894 @classmethod
1895 def day(cls, day):
1896 """Returns a range that only contains the given day"""
1897 return cls(day, day)
1898
1899 def __contains__(self, date):
1900 """Check if the date is in the range"""
1901 if not isinstance(date, datetime.date):
1902 date = date_from_str(date)
1903 return self.start <= date <= self.end
1904
1905 def __str__(self):
1906 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1907
1908 def __eq__(self, other):
1909 return (isinstance(other, DateRange)
1910 and self.start == other.start and self.end == other.end)
1911
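# Usage sketch (hypothetical dates):
# >>> '20200315' in DateRange('20200101', '20201231')
# True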
1912
1913 def platform_name():
1914 """ Returns the platform name as a str """
1915 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1916 return platform.platform()
1917
1918
1919 @functools.cache
1920 def system_identifier():
1921 python_implementation = platform.python_implementation()
1922 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1923 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1924
1925 return 'Python %s (%s %s) - %s %s' % (
1926 platform.python_version(),
1927 python_implementation,
1928 platform.architecture()[0],
1929 platform.platform(),
1930 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1931 )
1932
1933
1934 @functools.cache
1935 def get_windows_version():
1936 ''' Get the Windows version. Returns () if not running on Windows '''
1937 if compat_os_name == 'nt':
1938 return version_tuple(platform.win32_ver()[1])
1939 else:
1940 return ()
1941
1942
1943 def write_string(s, out=None, encoding=None):
1944 assert isinstance(s, str)
1945 out = out or sys.stderr
1946
1947 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1948 s = re.sub(r'([\r\n]+)', r' \1', s)
1949
1950 enc, buffer = None, out
1951 if 'b' in getattr(out, 'mode', ''):
1952 enc = encoding or preferredencoding()
1953 elif hasattr(out, 'buffer'):
1954 buffer = out.buffer
1955 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1956
1957 buffer.write(s.encode(enc, 'ignore') if enc else s)
1958 out.flush()
1959
1960
1961 def bytes_to_intlist(bs):
1962 if not bs:
1963 return []
1964 if isinstance(bs[0], int):  # bytes/bytearray elements are already ints in Python 3
1965 return list(bs)
1966 else:
1967 return [ord(c) for c in bs]
1968
1969
1970 def intlist_to_bytes(xs):
1971 if not xs:
1972 return b''
1973 return struct.pack('%dB' % len(xs), *xs)
1974
1975
1976 class LockingUnsupportedError(OSError):
1977 msg = 'File locking is not supported'
1978
1979 def __init__(self):
1980 super().__init__(self.msg)
1981
1982
1983 # Cross-platform file locking
1984 if sys.platform == 'win32':
1985 import ctypes
1986 import ctypes.wintypes
1987 import msvcrt
1988
1989 class OVERLAPPED(ctypes.Structure):
1990 _fields_ = [
1991 ('Internal', ctypes.wintypes.LPVOID),
1992 ('InternalHigh', ctypes.wintypes.LPVOID),
1993 ('Offset', ctypes.wintypes.DWORD),
1994 ('OffsetHigh', ctypes.wintypes.DWORD),
1995 ('hEvent', ctypes.wintypes.HANDLE),
1996 ]
1997
1998 kernel32 = ctypes.windll.kernel32
1999 LockFileEx = kernel32.LockFileEx
2000 LockFileEx.argtypes = [
2001 ctypes.wintypes.HANDLE, # hFile
2002 ctypes.wintypes.DWORD, # dwFlags
2003 ctypes.wintypes.DWORD, # dwReserved
2004 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2005 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2006 ctypes.POINTER(OVERLAPPED) # Overlapped
2007 ]
2008 LockFileEx.restype = ctypes.wintypes.BOOL
2009 UnlockFileEx = kernel32.UnlockFileEx
2010 UnlockFileEx.argtypes = [
2011 ctypes.wintypes.HANDLE, # hFile
2012 ctypes.wintypes.DWORD, # dwReserved
2013 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2014 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2015 ctypes.POINTER(OVERLAPPED) # Overlapped
2016 ]
2017 UnlockFileEx.restype = ctypes.wintypes.BOOL
2018 whole_low = 0xffffffff
2019 whole_high = 0x7fffffff
2020
2021 def _lock_file(f, exclusive, block):
2022 overlapped = OVERLAPPED()
2023 overlapped.Offset = 0
2024 overlapped.OffsetHigh = 0
2025 overlapped.hEvent = 0
2026 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2027
2028 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2029 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2030 0, whole_low, whole_high, f._lock_file_overlapped_p):
2031 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2032 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2033
2034 def _unlock_file(f):
2035 assert f._lock_file_overlapped_p
2036 handle = msvcrt.get_osfhandle(f.fileno())
2037 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2038 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2039
2040 else:
2041 try:
2042 import fcntl
2043
2044 def _lock_file(f, exclusive, block):
2045 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2046 if not block:
2047 flags |= fcntl.LOCK_NB
2048 try:
2049 fcntl.flock(f, flags)
2050 except BlockingIOError:
2051 raise
2052 except OSError: # AOSP does not have flock()
2053 fcntl.lockf(f, flags)
2054
2055 def _unlock_file(f):
2056 try:
2057 fcntl.flock(f, fcntl.LOCK_UN)
2058 except OSError:
2059 fcntl.lockf(f, fcntl.LOCK_UN)
2060
2061 except ImportError:
2062
2063 def _lock_file(f, exclusive, block):
2064 raise LockingUnsupportedError()
2065
2066 def _unlock_file(f):
2067 raise LockingUnsupportedError()
2068
2069
2070 class locked_file:
2071 locked = False
2072
2073 def __init__(self, filename, mode, block=True, encoding=None):
2074 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2075 raise NotImplementedError(mode)
2076 self.mode, self.block = mode, block
2077
2078 writable = any(f in mode for f in 'wax+')
2079 readable = any(f in mode for f in 'r+')
2080 flags = functools.reduce(operator.ior, (
2081 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2082 getattr(os, 'O_BINARY', 0), # Windows only
2083 getattr(os, 'O_NOINHERIT', 0), # Windows only
2084 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2085 os.O_APPEND if 'a' in mode else 0,
2086 os.O_EXCL if 'x' in mode else 0,
2087 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2088 ))
2089
2090 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2091
2092 def __enter__(self):
2093 exclusive = 'r' not in self.mode
2094 try:
2095 _lock_file(self.f, exclusive, self.block)
2096 self.locked = True
2097 except OSError:
2098 self.f.close()
2099 raise
2100 if 'w' in self.mode:
2101 try:
2102 self.f.truncate()
2103 except OSError as e:
2104 if e.errno not in (
2105 errno.ESPIPE, # Illegal seek - expected for FIFO
2106 errno.EINVAL, # Invalid argument - expected for /dev/null
2107 ):
2108 raise
2109 return self
2110
2111 def unlock(self):
2112 if not self.locked:
2113 return
2114 try:
2115 _unlock_file(self.f)
2116 finally:
2117 self.locked = False
2118
2119 def __exit__(self, *_):
2120 try:
2121 self.unlock()
2122 finally:
2123 self.f.close()
2124
2125 open = __enter__
2126 close = __exit__
2127
2128 def __getattr__(self, attr):
2129 return getattr(self.f, attr)
2130
2131 def __iter__(self):
2132 return iter(self.f)
2133
2134
2135 @functools.cache
2136 def get_filesystem_encoding():
2137 encoding = sys.getfilesystemencoding()
2138 return encoding if encoding is not None else 'utf-8'
2139
2140
2141 def shell_quote(args):
2142 quoted_args = []
2143 encoding = get_filesystem_encoding()
2144 for a in args:
2145 if isinstance(a, bytes):
2146 # We may get a filename encoded with 'encodeFilename'
2147 a = a.decode(encoding)
2148 quoted_args.append(compat_shlex_quote(a))
2149 return ' '.join(quoted_args)
2150
2151
2152 def smuggle_url(url, data):
2153 """ Pass additional data in a URL for internal use. """
2154
2155 url, idata = unsmuggle_url(url, {})
2156 data.update(idata)
2157 sdata = urllib.parse.urlencode(
2158 {'__youtubedl_smuggle': json.dumps(data)})
2159 return url + '#' + sdata
2160
2161
2162 def unsmuggle_url(smug_url, default=None):
2163 if '#__youtubedl_smuggle' not in smug_url:
2164 return smug_url, default
2165 url, _, sdata = smug_url.rpartition('#')
2166 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2167 data = json.loads(jsond)
2168 return url, data
2169
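# Usage sketch: the two functions round-trip (hypothetical URL and payload):
# >>> unsmuggle_url(smuggle_url('https://example.com/v', {'referer': 'x'}))
# ('https://example.com/v', {'referer': 'x'})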
2170
2171 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2172 """ Formats numbers with decimal sufixes like K, M, etc """
2173 num, factor = float_or_none(num), float(factor)
2174 if num is None or num < 0:
2175 return None
2176 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2177 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2178 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2179 if factor == 1024:
2180 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2181 converted = num / (factor ** exponent)
2182 return fmt % (converted, suffix)
2183
2184
2185 def format_bytes(bytes):
2186 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2187
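# Usage sketch (illustrative values):
# >>> format_decimal_suffix(1500, '%.1f%s')
# '1.5k'
# >>> format_bytes(1536)  # factor=1024 switches to binary suffixes
# '1.50KiB'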
2188
2189 def lookup_unit_table(unit_table, s):
2190 units_re = '|'.join(re.escape(u) for u in unit_table)
2191 m = re.match(
2192 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2193 if not m:
2194 return None
2195 num_str = m.group('num').replace(',', '.')
2196 mult = unit_table[m.group('unit')]
2197 return int(float(num_str) * mult)
2198
2199
2200 def parse_filesize(s):
2201 if s is None:
2202 return None
2203
2204 # The lower-case forms are of course incorrect and unofficial,
2205 # but we support those too
2206 _UNIT_TABLE = {
2207 'B': 1,
2208 'b': 1,
2209 'bytes': 1,
2210 'KiB': 1024,
2211 'KB': 1000,
2212 'kB': 1024,
2213 'Kb': 1000,
2214 'kb': 1000,
2215 'kilobytes': 1000,
2216 'kibibytes': 1024,
2217 'MiB': 1024 ** 2,
2218 'MB': 1000 ** 2,
2219 'mB': 1024 ** 2,
2220 'Mb': 1000 ** 2,
2221 'mb': 1000 ** 2,
2222 'megabytes': 1000 ** 2,
2223 'mebibytes': 1024 ** 2,
2224 'GiB': 1024 ** 3,
2225 'GB': 1000 ** 3,
2226 'gB': 1024 ** 3,
2227 'Gb': 1000 ** 3,
2228 'gb': 1000 ** 3,
2229 'gigabytes': 1000 ** 3,
2230 'gibibytes': 1024 ** 3,
2231 'TiB': 1024 ** 4,
2232 'TB': 1000 ** 4,
2233 'tB': 1024 ** 4,
2234 'Tb': 1000 ** 4,
2235 'tb': 1000 ** 4,
2236 'terabytes': 1000 ** 4,
2237 'tebibytes': 1024 ** 4,
2238 'PiB': 1024 ** 5,
2239 'PB': 1000 ** 5,
2240 'pB': 1024 ** 5,
2241 'Pb': 1000 ** 5,
2242 'pb': 1000 ** 5,
2243 'petabytes': 1000 ** 5,
2244 'pebibytes': 1024 ** 5,
2245 'EiB': 1024 ** 6,
2246 'EB': 1000 ** 6,
2247 'eB': 1024 ** 6,
2248 'Eb': 1000 ** 6,
2249 'eb': 1000 ** 6,
2250 'exabytes': 1000 ** 6,
2251 'exbibytes': 1024 ** 6,
2252 'ZiB': 1024 ** 7,
2253 'ZB': 1000 ** 7,
2254 'zB': 1024 ** 7,
2255 'Zb': 1000 ** 7,
2256 'zb': 1000 ** 7,
2257 'zettabytes': 1000 ** 7,
2258 'zebibytes': 1024 ** 7,
2259 'YiB': 1024 ** 8,
2260 'YB': 1000 ** 8,
2261 'yB': 1024 ** 8,
2262 'Yb': 1000 ** 8,
2263 'yb': 1000 ** 8,
2264 'yottabytes': 1000 ** 8,
2265 'yobibytes': 1024 ** 8,
2266 }
2267
2268 return lookup_unit_table(_UNIT_TABLE, s)
2269
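# Usage sketch (note the deliberately lenient casing in the table above):
# >>> parse_filesize('1.5 MiB')
# 1572864
# >>> parse_filesize('2 MB')
# 2000000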
2270
2271 def parse_count(s):
2272 if s is None:
2273 return None
2274
2275 s = re.sub(r'^[^\d]+\s', '', s).strip()
2276
2277 if re.match(r'^[\d,.]+$', s):
2278 return str_to_int(s)
2279
2280 _UNIT_TABLE = {
2281 'k': 1000,
2282 'K': 1000,
2283 'm': 1000 ** 2,
2284 'M': 1000 ** 2,
2285 'kk': 1000 ** 2,
2286 'KK': 1000 ** 2,
2287 'b': 1000 ** 3,
2288 'B': 1000 ** 3,
2289 }
2290
2291 ret = lookup_unit_table(_UNIT_TABLE, s)
2292 if ret is not None:
2293 return ret
2294
2295 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2296 if mobj:
2297 return str_to_int(mobj.group(1))
2298
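# Usage sketch (hypothetical count strings):
# >>> parse_count('1.2M')
# 1200000
# >>> parse_count('11,775 views')
# 11775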
2299
2300 def parse_resolution(s, *, lenient=False):
2301 if s is None:
2302 return {}
2303
2304 if lenient:
2305 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2306 else:
2307 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2308 if mobj:
2309 return {
2310 'width': int(mobj.group('w')),
2311 'height': int(mobj.group('h')),
2312 }
2313
2314 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2315 if mobj:
2316 return {'height': int(mobj.group(1))}
2317
2318 mobj = re.search(r'\b([48])[kK]\b', s)
2319 if mobj:
2320 return {'height': int(mobj.group(1)) * 540}
2321
2322 return {}
2323
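# Usage sketch (hypothetical resolution strings):
# >>> parse_resolution('1920x1080')
# {'width': 1920, 'height': 1080}
# >>> parse_resolution('4k')
# {'height': 2160}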
2324
2325 def parse_bitrate(s):
2326 if not isinstance(s, str):
2327 return
2328 mobj = re.search(r'\b(\d+)\s*kbps', s)
2329 if mobj:
2330 return int(mobj.group(1))
2331
2332
2333 def month_by_name(name, lang='en'):
2334 """ Return the number of a month by (locale-independently) English name """
2335
2336 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2337
2338 try:
2339 return month_names.index(name) + 1
2340 except ValueError:
2341 return None
2342
2343
2344 def month_by_abbreviation(abbrev):
2345 """ Return the number of a month by (locale-independently) English
2346 abbreviations """
2347
2348 try:
2349 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2350 except ValueError:
2351 return None
2352
2353
2354 def fix_xml_ampersands(xml_str):
2355 """Replace all the '&' by '&amp;' in XML"""
2356 return re.sub(
2357 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2358 '&amp;',
2359 xml_str)
2360
2361
2362 def setproctitle(title):
2363 assert isinstance(title, str)
2364
2365 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2366 try:
2367 import ctypes
2368 except ImportError:
2369 return
2370
2371 try:
2372 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2373 except OSError:
2374 return
2375 except TypeError:
2376 # Legacy note: LoadLibrary in Windows Python 2.7.13 only expected
2377 # a bytestring, but unicode_literals turned every string
2378 # into a unicode string, making it fail
2379 return
2380 title_bytes = title.encode()
2381 buf = ctypes.create_string_buffer(len(title_bytes))
2382 buf.value = title_bytes
2383 try:
2384 libc.prctl(15, buf, 0, 0, 0)
2385 except AttributeError:
2386 return # Strange libc, just skip this
2387
2388
2389 def remove_start(s, start):
2390 return s[len(start):] if s is not None and s.startswith(start) else s
2391
2392
2393 def remove_end(s, end):
2394 return s[:-len(end)] if s is not None and s.endswith(end) else s
2395
2396
2397 def remove_quotes(s):
2398 if s is None or len(s) < 2:
2399 return s
2400 for quote in ('"', "'", ):
2401 if s[0] == quote and s[-1] == quote:
2402 return s[1:-1]
2403 return s
2404
2405
2406 def get_domain(url):
2407 """
2408 This implementation is inconsistent, but is kept for compatibility.
2409 Use this only for "webpage_url_domain"
2410 """
2411 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2412
2413
2414 def url_basename(url):
2415 path = urllib.parse.urlparse(url).path
2416 return path.strip('/').split('/')[-1]
2417
2418
2419 def base_url(url):
2420 return re.match(r'https?://[^?#&]+/', url).group()
2421
2422
2423 def urljoin(base, path):
2424 if isinstance(path, bytes):
2425 path = path.decode()
2426 if not isinstance(path, str) or not path:
2427 return None
2428 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2429 return path
2430 if isinstance(base, bytes):
2431 base = base.decode()
2432 if not isinstance(base, str) or not re.match(
2433 r'^(?:https?:)?//', base):
2434 return None
2435 return urllib.parse.urljoin(base, path)
2436
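# Usage sketch (hypothetical URLs; non-http(s) bases yield None):
# >>> urljoin('https://example.com/a/', 'b.mp4')
# 'https://example.com/a/b.mp4'
# >>> urljoin('ftp://example.com/', 'b.mp4') is None
# True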
2437
2438 class HEADRequest(urllib.request.Request):
2439 def get_method(self):
2440 return 'HEAD'
2441
2442
2443 class PUTRequest(urllib.request.Request):
2444 def get_method(self):
2445 return 'PUT'
2446
2447
2448 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2449 if get_attr and v is not None:
2450 v = getattr(v, get_attr, None)
2451 try:
2452 return int(v) * invscale // scale
2453 except (ValueError, TypeError, OverflowError):
2454 return default
2455
2456
2457 def str_or_none(v, default=None):
2458 return default if v is None else str(v)
2459
2460
2461 def str_to_int(int_str):
2462 """ A more relaxed version of int_or_none """
2463 if isinstance(int_str, int):
2464 return int_str
2465 elif isinstance(int_str, str):
2466 int_str = re.sub(r'[,\.\+]', '', int_str)
2467 return int_or_none(int_str)
2468
2469
2470 def float_or_none(v, scale=1, invscale=1, default=None):
2471 if v is None:
2472 return default
2473 try:
2474 return float(v) * invscale / scale
2475 except (ValueError, TypeError):
2476 return default
2477
2478
2479 def bool_or_none(v, default=None):
2480 return v if isinstance(v, bool) else default
2481
2482
2483 def strip_or_none(v, default=None):
2484 return v.strip() if isinstance(v, str) else default
2485
2486
2487 def url_or_none(url):
2488 if not url or not isinstance(url, str):
2489 return None
2490 url = url.strip()
2491 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2492
2493
2494 def request_to_url(req):
2495 if isinstance(req, urllib.request.Request):
2496 return req.get_full_url()
2497 else:
2498 return req
2499
2500
2501 def strftime_or_none(timestamp, date_format, default=None):
2502 datetime_object = None
2503 try:
2504 if isinstance(timestamp, (int, float)): # unix timestamp
2505 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2506 elif isinstance(timestamp, str): # assume YYYYMMDD
2507 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2508 return datetime_object.strftime(date_format)
2509 except (ValueError, TypeError, AttributeError):
2510 return default
2511
2512
2513 def parse_duration(s):
2514 if not isinstance(s, str):
2515 return None
2516 s = s.strip()
2517 if not s:
2518 return None
2519
2520 days, hours, mins, secs, ms = [None] * 5
2521 m = re.match(r'''(?x)
2522 (?P<before_secs>
2523 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2524 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2525 (?P<ms>[.:][0-9]+)?Z?$
2526 ''', s)
2527 if m:
2528 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2529 else:
2530 m = re.match(
2531 r'''(?ix)(?:P?
2532 (?:
2533 [0-9]+\s*y(?:ears?)?,?\s*
2534 )?
2535 (?:
2536 [0-9]+\s*m(?:onths?)?,?\s*
2537 )?
2538 (?:
2539 [0-9]+\s*w(?:eeks?)?,?\s*
2540 )?
2541 (?:
2542 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2543 )?
2544 T)?
2545 (?:
2546 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2547 )?
2548 (?:
2549 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2550 )?
2551 (?:
2552 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2553 )?Z?$''', s)
2554 if m:
2555 days, hours, mins, secs, ms = m.groups()
2556 else:
2557 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2558 if m:
2559 hours, mins = m.groups()
2560 else:
2561 return None
2562
2563 if ms:
2564 ms = ms.replace(':', '.')
2565 return sum(float(part or 0) * mult for part, mult in (
2566 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2567
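# Usage sketch (hypothetical duration strings):
# >>> parse_duration('1:23:45')
# 5025.0
# >>> parse_duration('2h 30m')
# 9000.0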
2568
2569 def prepend_extension(filename, ext, expected_real_ext=None):
2570 name, real_ext = os.path.splitext(filename)
2571 return (
2572 f'{name}.{ext}{real_ext}'
2573 if not expected_real_ext or real_ext[1:] == expected_real_ext
2574 else f'{filename}.{ext}')
2575
2576
2577 def replace_extension(filename, ext, expected_real_ext=None):
2578 name, real_ext = os.path.splitext(filename)
2579 return '{}.{}'.format(
2580 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2581 ext)
2582
2583
2584 def check_executable(exe, args=[]):
2585 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2586 args can be a list of arguments for a short output (like -version) """
2587 try:
2588 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2589 except OSError:
2590 return False
2591 return exe
2592
2593
2594 def _get_exe_version_output(exe, args, *, to_screen=None):
2595 if to_screen:
2596 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2597 try:
2598 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2599 # SIGTTOU if yt-dlp is run in the background.
2600 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2601 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2602 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2603 except OSError:
2604 return False
2605 return stdout
2606
2607
2608 def detect_exe_version(output, version_re=None, unrecognized='present'):
2609 assert isinstance(output, str)
2610 if version_re is None:
2611 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2612 m = re.search(version_re, output)
2613 if m:
2614 return m.group(1)
2615 else:
2616 return unrecognized
2617
2618
2619 def get_exe_version(exe, args=['--version'],
2620 version_re=None, unrecognized='present'):
2621 """ Returns the version of the specified executable,
2622 or False if the executable is not present """
2623 out = _get_exe_version_output(exe, args)
2624 return detect_exe_version(out, version_re, unrecognized) if out else False
2625
2626
2627 def frange(start=0, stop=None, step=1):
2628 """Float range"""
2629 if stop is None:
2630 start, stop = 0, start
2631 sign = [-1, 1][step > 0] if step else 0
2632 while sign * start < sign * stop:
2633 yield start
2634 start += step
2635
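# Usage sketch: like range(), but with float support, e.g.
# >>> list(frange(0, 1, 0.25))
# [0, 0.25, 0.5, 0.75]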
2636
2637 class LazyList(collections.abc.Sequence):
2638 """Lazy immutable list from an iterable
2639 Note that slices of a LazyList are lists and not LazyList"""
2640
2641 class IndexError(IndexError):
2642 pass
2643
2644 def __init__(self, iterable, *, reverse=False, _cache=None):
2645 self._iterable = iter(iterable)
2646 self._cache = [] if _cache is None else _cache
2647 self._reversed = reverse
2648
2649 def __iter__(self):
2650 if self._reversed:
2651 # We need to consume the entire iterable to iterate in reverse
2652 yield from self.exhaust()
2653 return
2654 yield from self._cache
2655 for item in self._iterable:
2656 self._cache.append(item)
2657 yield item
2658
2659 def _exhaust(self):
2660 self._cache.extend(self._iterable)
2661 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2662 return self._cache
2663
2664 def exhaust(self):
2665 """Evaluate the entire iterable"""
2666 return self._exhaust()[::-1 if self._reversed else 1]
2667
2668 @staticmethod
2669 def _reverse_index(x):
2670 return None if x is None else ~x
2671
2672 def __getitem__(self, idx):
2673 if isinstance(idx, slice):
2674 if self._reversed:
2675 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2676 start, stop, step = idx.start, idx.stop, idx.step or 1
2677 elif isinstance(idx, int):
2678 if self._reversed:
2679 idx = self._reverse_index(idx)
2680 start, stop, step = idx, idx, 0
2681 else:
2682 raise TypeError('indices must be integers or slices')
2683 if ((start or 0) < 0 or (stop or 0) < 0
2684 or (start is None and step < 0)
2685 or (stop is None and step > 0)):
2686 # We need to consume the entire iterable to be able to slice from the end
2687 # Obviously, never use this with infinite iterables
2688 self._exhaust()
2689 try:
2690 return self._cache[idx]
2691 except IndexError as e:
2692 raise self.IndexError(e) from e
2693 n = max(start or 0, stop or 0) - len(self._cache) + 1
2694 if n > 0:
2695 self._cache.extend(itertools.islice(self._iterable, n))
2696 try:
2697 return self._cache[idx]
2698 except IndexError as e:
2699 raise self.IndexError(e) from e
2700
2701 def __bool__(self):
2702 try:
2703 self[-1] if self._reversed else self[0]
2704 except self.IndexError:
2705 return False
2706 return True
2707
2708 def __len__(self):
2709 self._exhaust()
2710 return len(self._cache)
2711
2712 def __reversed__(self):
2713 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2714
2715 def __copy__(self):
2716 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2717
2718 def __repr__(self):
2719 # repr and str should mimic a list. So we exhaust the iterable
2720 return repr(self.exhaust())
2721
2722 def __str__(self):
2723 return repr(self.exhaust())
2724
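# Usage sketch: elements are cached as they are consumed (editor's illustration):
# >>> ll = LazyList(itertools.count())  # fine as long as it is not sliced from the end
# >>> ll[3]
# 3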
2725
2726 class PagedList:
2727
2728 class IndexError(IndexError):
2729 pass
2730
2731 def __len__(self):
2732 # This is only useful for tests
2733 return len(self.getslice())
2734
2735 def __init__(self, pagefunc, pagesize, use_cache=True):
2736 self._pagefunc = pagefunc
2737 self._pagesize = pagesize
2738 self._pagecount = float('inf')
2739 self._use_cache = use_cache
2740 self._cache = {}
2741
2742 def getpage(self, pagenum):
2743 page_results = self._cache.get(pagenum)
2744 if page_results is None:
2745 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2746 if self._use_cache:
2747 self._cache[pagenum] = page_results
2748 return page_results
2749
2750 def getslice(self, start=0, end=None):
2751 return list(self._getslice(start, end))
2752
2753 def _getslice(self, start, end):
2754 raise NotImplementedError('This method must be implemented by subclasses')
2755
2756 def __getitem__(self, idx):
2757 assert self._use_cache, 'Indexing PagedList requires cache'
2758 if not isinstance(idx, int) or idx < 0:
2759 raise TypeError('indices must be non-negative integers')
2760 entries = self.getslice(idx, idx + 1)
2761 if not entries:
2762 raise self.IndexError()
2763 return entries[0]
2764
2765
2766 class OnDemandPagedList(PagedList):
2767 """Download pages until a page with less than maximum results"""
2768
2769 def _getslice(self, start, end):
2770 for pagenum in itertools.count(start // self._pagesize):
2771 firstid = pagenum * self._pagesize
2772 nextfirstid = pagenum * self._pagesize + self._pagesize
2773 if start >= nextfirstid:
2774 continue
2775
2776 startv = (
2777 start % self._pagesize
2778 if firstid <= start < nextfirstid
2779 else 0)
2780 endv = (
2781 ((end - 1) % self._pagesize) + 1
2782 if (end is not None and firstid <= end <= nextfirstid)
2783 else None)
2784
2785 try:
2786 page_results = self.getpage(pagenum)
2787 except Exception:
2788 self._pagecount = pagenum - 1
2789 raise
2790 if startv != 0 or endv is not None:
2791 page_results = page_results[startv:endv]
2792 yield from page_results
2793
2794 # A little optimization: if the current page is not "full", i.e. does
2795 # not contain page_size videos, then we can assume that this page
2796 # is the last one - there are no more ids on further pages,
2797 # so there is no need to query again.
2798 if len(page_results) + startv < self._pagesize:
2799 break
2800
2801 # If we got the whole page, but the next page is not interesting,
2802 # break out early as well
2803 if end == nextfirstid:
2804 break
2805
2806
2807 class InAdvancePagedList(PagedList):
2808 """PagedList with total number of pages known in advance"""
2809
2810 def __init__(self, pagefunc, pagecount, pagesize):
2811 PagedList.__init__(self, pagefunc, pagesize, True)
2812 self._pagecount = pagecount
2813
2814 def _getslice(self, start, end):
2815 start_page = start // self._pagesize
2816 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2817 skip_elems = start - start_page * self._pagesize
2818 only_more = None if end is None else end - start
2819 for pagenum in range(start_page, end_page):
2820 page_results = self.getpage(pagenum)
2821 if skip_elems:
2822 page_results = page_results[skip_elems:]
2823 skip_elems = None
2824 if only_more is not None:
2825 if len(page_results) < only_more:
2826 only_more -= len(page_results)
2827 else:
2828 yield from page_results[:only_more]
2829 break
2830 yield from page_results
2831
2832
2833 class PlaylistEntries:
2834 MissingEntry = object()
2835 is_exhausted = False
2836
2837 def __init__(self, ydl, info_dict):
2838 self.ydl = ydl
2839
2840 # _entries must be assigned now since infodict can change during iteration
2841 entries = info_dict.get('entries')
2842 if entries is None:
2843 raise EntryNotInPlaylist('There are no entries')
2844 elif isinstance(entries, list):
2845 self.is_exhausted = True
2846
2847 requested_entries = info_dict.get('requested_entries')
2848 self.is_incomplete = bool(requested_entries)
2849 if self.is_incomplete:
2850 assert self.is_exhausted
2851 self._entries = [self.MissingEntry] * max(requested_entries)
2852 for i, entry in zip(requested_entries, entries):
2853 self._entries[i - 1] = entry
2854 elif isinstance(entries, (list, PagedList, LazyList)):
2855 self._entries = entries
2856 else:
2857 self._entries = LazyList(entries)
2858
2859 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2860 (?P<start>[+-]?\d+)?
2861 (?P<range>[:-]
2862 (?P<end>[+-]?\d+|inf(?:inite)?)?
2863 (?::(?P<step>[+-]?\d+))?
2864 )?''')
2865
2866 @classmethod
2867 def parse_playlist_items(cls, string):
2868 for segment in string.split(','):
2869 if not segment:
2870 raise ValueError('There are two or more consecutive commas')
2871 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2872 if not mobj:
2873 raise ValueError(f'{segment!r} is not a valid specification')
2874 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2875 if int_or_none(step) == 0:
2876 raise ValueError(f'Step in {segment!r} cannot be zero')
2877 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2878
2879 def get_requested_items(self):
2880 playlist_items = self.ydl.params.get('playlist_items')
2881 playlist_start = self.ydl.params.get('playliststart', 1)
2882 playlist_end = self.ydl.params.get('playlistend')
2883 # For backwards compatibility, interpret -1 as whole list
2884 if playlist_end in (-1, None):
2885 playlist_end = ''
2886 if not playlist_items:
2887 playlist_items = f'{playlist_start}:{playlist_end}'
2888 elif playlist_start != 1 or playlist_end:
2889 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2890
2891 for index in self.parse_playlist_items(playlist_items):
2892 for i, entry in self[index]:
2893 yield i, entry
2894 if not entry:
2895 continue
2896 try:
2897 # TODO: Add auto-generated fields
2898 self.ydl._match_entry(entry, incomplete=True, silent=True)
2899 except (ExistingVideoReached, RejectedVideoReached):
2900 return
2901
2902 def get_full_count(self):
2903 if self.is_exhausted and not self.is_incomplete:
2904 return len(self)
2905 elif isinstance(self._entries, InAdvancePagedList):
2906 if self._entries._pagesize == 1:
2907 return self._entries._pagecount
2908
2909 @functools.cached_property
2910 def _getter(self):
2911 if isinstance(self._entries, list):
2912 def get_entry(i):
2913 try:
2914 entry = self._entries[i]
2915 except IndexError:
2916 entry = self.MissingEntry
2917 if not self.is_incomplete:
2918 raise self.IndexError()
2919 if entry is self.MissingEntry:
2920 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2921 return entry
2922 else:
2923 def get_entry(i):
2924 try:
2925 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2926 except (LazyList.IndexError, PagedList.IndexError):
2927 raise self.IndexError()
2928 return get_entry
2929
2930 def __getitem__(self, idx):
2931 if isinstance(idx, int):
2932 idx = slice(idx, idx)
2933
2934 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2935 step = 1 if idx.step is None else idx.step
2936 if idx.start is None:
2937 start = 0 if step > 0 else len(self) - 1
2938 else:
2939 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2940
2941 # NB: Do not call len(self) when idx == [:]
2942 if idx.stop is None:
2943 stop = 0 if step < 0 else float('inf')
2944 else:
2945 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2946 stop += [-1, 1][step > 0]
2947
2948 for i in frange(start, stop, step):
2949 if i < 0:
2950 continue
2951 try:
2952 entry = self._getter(i)
2953 except self.IndexError:
2954 self.is_exhausted = True
2955 if step > 0:
2956 break
2957 continue
2958 yield i + 1, entry
2959
2960 def __len__(self):
2961 return len(tuple(self[:]))
2962
2963 class IndexError(IndexError):
2964 pass
2965
2966
2967 def uppercase_escape(s):
2968 unicode_escape = codecs.getdecoder('unicode_escape')
2969 return re.sub(
2970 r'\\U[0-9a-fA-F]{8}',
2971 lambda m: unicode_escape(m.group(0))[0],
2972 s)
2973
2974
2975 def lowercase_escape(s):
2976 unicode_escape = codecs.getdecoder('unicode_escape')
2977 return re.sub(
2978 r'\\u[0-9a-fA-F]{4}',
2979 lambda m: unicode_escape(m.group(0))[0],
2980 s)
2981
2982
2983 def escape_rfc3986(s):
2984 """Escape non-ASCII characters as suggested by RFC 3986"""
2985 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2986
2987
2988 def escape_url(url):
2989 """Escape URL as suggested by RFC 3986"""
2990 url_parsed = urllib.parse.urlparse(url)
2991 return url_parsed._replace(
2992 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2993 path=escape_rfc3986(url_parsed.path),
2994 params=escape_rfc3986(url_parsed.params),
2995 query=escape_rfc3986(url_parsed.query),
2996 fragment=escape_rfc3986(url_parsed.fragment)
2997 ).geturl()
2998
2999
3000 def parse_qs(url):
3001 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3002
3003
3004 def read_batch_urls(batch_fd):
3005 def fixup(url):
3006 if not isinstance(url, str):
3007 url = url.decode('utf-8', 'replace')
3008 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3009 for bom in BOM_UTF8:
3010 if url.startswith(bom):
3011 url = url[len(bom):]
3012 url = url.lstrip()
3013 if not url or url.startswith(('#', ';', ']')):
3014 return False
3015 # "#" cannot be stripped out since it is part of the URI
3016 # However, it can be safely stripped out if following a whitespace
3017 return re.split(r'\s#', url, 1)[0].rstrip()
3018
3019 with contextlib.closing(batch_fd) as fd:
3020 return [url for url in map(fixup, fd) if url]
3021
3022
3023 def urlencode_postdata(*args, **kargs):
3024 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3025
3026
3027 def update_url_query(url, query):
3028 if not query:
3029 return url
3030 parsed_url = urllib.parse.urlparse(url)
3031 qs = urllib.parse.parse_qs(parsed_url.query)
3032 qs.update(query)
3033 return urllib.parse.urlunparse(parsed_url._replace(
3034 query=urllib.parse.urlencode(qs, True)))
3035
3036
3037 def update_Request(req, url=None, data=None, headers=None, query=None):
3038 req_headers = req.headers.copy()
3039 req_headers.update(headers or {})
3040 req_data = data or req.data
3041 req_url = update_url_query(url or req.get_full_url(), query)
3042 req_get_method = req.get_method()
3043 if req_get_method == 'HEAD':
3044 req_type = HEADRequest
3045 elif req_get_method == 'PUT':
3046 req_type = PUTRequest
3047 else:
3048 req_type = urllib.request.Request
3049 new_req = req_type(
3050 req_url, data=req_data, headers=req_headers,
3051 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3052 if hasattr(req, 'timeout'):
3053 new_req.timeout = req.timeout
3054 return new_req
3055
3056
3057 def _multipart_encode_impl(data, boundary):
3058 content_type = 'multipart/form-data; boundary=%s' % boundary
3059
3060 out = b''
3061 for k, v in data.items():
3062 out += b'--' + boundary.encode('ascii') + b'\r\n'
3063 if isinstance(k, str):
3064 k = k.encode()
3065 if isinstance(v, str):
3066 v = v.encode()
3067 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3068 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3069 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3070 if boundary.encode('ascii') in content:
3071 raise ValueError('Boundary overlaps with data')
3072 out += content
3073
3074 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3075
3076 return out, content_type
3077
3078
3079 def multipart_encode(data, boundary=None):
3080 '''
3081 Encode a dict to RFC 7578-compliant form-data
3082
3083 data:
3084 A dict where keys and values can be either Unicode or bytes-like
3085 objects.
3086 boundary:
3087 If specified, the given Unicode object is used as the boundary. Otherwise
3088 a random boundary is generated.
3089
3090 Reference: https://tools.ietf.org/html/rfc7578
3091 '''
3092 has_specified_boundary = boundary is not None
3093
3094 while True:
3095 if boundary is None:
3096 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3097
3098 try:
3099 out, content_type = _multipart_encode_impl(data, boundary)
3100 break
3101 except ValueError:
3102 if has_specified_boundary:
3103 raise
3104 boundary = None
3105
3106 return out, content_type
3107
3108
3109 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3110 for val in map(d.get, variadic(key_or_keys)):
3111 if val is not None and (val or not skip_false_values):
3112 return val
3113 return default
3114
3115
3116 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3117 for f in funcs:
3118 try:
3119 val = f(*args, **kwargs)
3120 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3121 pass
3122 else:
3123 if expected_type is None or isinstance(val, expected_type):
3124 return val
3125
3126
3127 def try_get(src, getter, expected_type=None):
3128 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3129
3130
3131 def filter_dict(dct, cndn=lambda _, v: v is not None):
3132 return {k: v for k, v in dct.items() if cndn(k, v)}
3133
3134
3135 def merge_dicts(*dicts):
3136 merged = {}
3137 for a_dict in dicts:
3138 for k, v in a_dict.items():
3139 if (v is not None and k not in merged
3140 or isinstance(v, str) and merged[k] == ''):
3141 merged[k] = v
3142 return merged
3143
3144
3145 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3146 return string if isinstance(string, str) else str(string, encoding, errors)
3147
3148
3149 US_RATINGS = {
3150 'G': 0,
3151 'PG': 10,
3152 'PG-13': 13,
3153 'R': 16,
3154 'NC': 18,
3155 }
3156
3157
3158 TV_PARENTAL_GUIDELINES = {
3159 'TV-Y': 0,
3160 'TV-Y7': 7,
3161 'TV-G': 0,
3162 'TV-PG': 0,
3163 'TV-14': 14,
3164 'TV-MA': 17,
3165 }
3166
3167
3168 def parse_age_limit(s):
3169 # isinstance(False, int) is True. So type() must be used instead
3170 if type(s) is int: # noqa: E721
3171 return s if 0 <= s <= 21 else None
3172 elif not isinstance(s, str):
3173 return None
3174 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3175 if m:
3176 return int(m.group('age'))
3177 s = s.upper()
3178 if s in US_RATINGS:
3179 return US_RATINGS[s]
3180 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3181 if m:
3182 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3183 return None
3184
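# Usage sketch (illustrative rating strings):
# >>> parse_age_limit('PG-13')
# 13
# >>> parse_age_limit('TV-MA')
# 17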
3185
3186 def strip_jsonp(code):
3187 return re.sub(
3188 r'''(?sx)^
3189 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3190 (?:\s*&&\s*(?P=func_name))?
3191 \s*\(\s*(?P<callback_data>.*)\);?
3192 \s*?(?://[^\n]*)*$''',
3193 r'\g<callback_data>', code)
3194
3195
3196 def js_to_json(code, vars={}):
3197 # vars is a dict of var, val pairs to substitute
3198 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3199 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3200 INTEGER_TABLE = (
3201 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3202 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3203 )
3204
3205 def fix_kv(m):
3206 v = m.group(0)
3207 if v in ('true', 'false', 'null'):
3208 return v
3209 elif v in ('undefined', 'void 0'):
3210 return 'null'
3211 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3212 return ""
3213
3214 if v[0] in ("'", '"'):
3215 v = re.sub(r'(?s)\\.|"', lambda m: {
3216 '"': '\\"',
3217 "\\'": "'",
3218 '\\\n': '',
3219 '\\x': '\\u00',
3220 }.get(m.group(0), m.group(0)), v[1:-1])
3221 else:
3222 for regex, base in INTEGER_TABLE:
3223 im = re.match(regex, v)
3224 if im:
3225 i = int(im.group(1), base)
3226 return '"%d":' % i if v.endswith(':') else '%d' % i
3227
3228 if v in vars:
3229 return vars[v]
3230
3231 return '"%s"' % v
3232
3233 def create_map(mobj):
3234 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3235
3236 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3237 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3238
3239 return re.sub(r'''(?sx)
3240 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3241 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3242 {comment}|,(?={skip}[\]}}])|
3243 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3244 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3245 [0-9]+(?={skip}:)|
3246 !+
3247 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3248
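# Usage sketch (a hypothetical JS object literal):
# >>> js_to_json("{abc: 'def', num: 0x1A}")
# '{"abc": "def", "num": 26}'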
3249
3250 def qualities(quality_ids):
3251 """ Get a numeric quality value out of a list of possible values """
3252 def q(qid):
3253 try:
3254 return quality_ids.index(qid)
3255 except ValueError:
3256 return -1
3257 return q
3258
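# Usage sketch: unknown qualities sort below all known ones, e.g.
# >>> q = qualities(['240p', '480p', '720p'])
# >>> q('480p'), q('4320p')
# (1, -1)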
3259
3260 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3261
3262
3263 DEFAULT_OUTTMPL = {
3264 'default': '%(title)s [%(id)s].%(ext)s',
3265 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3266 }
3267 OUTTMPL_TYPES = {
3268 'chapter': None,
3269 'subtitle': None,
3270 'thumbnail': None,
3271 'description': 'description',
3272 'annotation': 'annotations.xml',
3273 'infojson': 'info.json',
3274 'link': None,
3275 'pl_video': None,
3276 'pl_thumbnail': None,
3277 'pl_description': 'description',
3278 'pl_infojson': 'info.json',
3279 }
3280
3281 # As of [1] format syntax is:
3282 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3283 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3284 STR_FORMAT_RE_TMPL = r'''(?x)
3285 (?<!%)(?P<prefix>(?:%%)*)
3286 %
3287 (?P<has_key>\((?P<key>{0})\))?
3288 (?P<format>
3289 (?P<conversion>[#0\-+ ]+)?
3290 (?P<min_width>\d+)?
3291 (?P<precision>\.\d+)?
3292 (?P<len_mod>[hlL])? # unused in python
3293 {1} # conversion type
3294 )
3295 '''
3296
3297
3298 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3299
3300
3301 def limit_length(s, length):
3302 """ Add ellipses to overly long strings """
3303 if s is None:
3304 return None
3305 ELLIPSES = '...'
3306 if len(s) > length:
3307 return s[:length - len(ELLIPSES)] + ELLIPSES
3308 return s
3309
3310
3311 def version_tuple(v):
3312 return tuple(int(e) for e in re.split(r'[-.]', v))
3313
3314
3315 def is_outdated_version(version, limit, assume_new=True):
3316 if not version:
3317 return not assume_new
3318 try:
3319 return version_tuple(version) < version_tuple(limit)
3320 except ValueError:
3321 return not assume_new
3322
3323
3324 def ytdl_is_updateable():
3325 """ Returns if yt-dlp can be updated with -U """
3326
3327 from .update import is_non_updateable
3328
3329 return not is_non_updateable()
3330
3331
3332 def args_to_str(args):
3333 # Get a short string representation for a subprocess command
3334 return ' '.join(compat_shlex_quote(a) for a in args)
3335
3336
3337 def error_to_compat_str(err):
3338 return str(err)
3339
3340
3341 def error_to_str(err):
3342 return f'{type(err).__name__}: {err}'
3343
3344
3345 def mimetype2ext(mt):
3346 if mt is None:
3347 return None
3348
3349 mt, _, params = mt.partition(';')
3350 mt = mt.strip()
3351
3352 FULL_MAP = {
3353 'audio/mp4': 'm4a',
3354 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3355 # it's the most popular one
3356 'audio/mpeg': 'mp3',
3357 'audio/x-wav': 'wav',
3358 'audio/wav': 'wav',
3359 'audio/wave': 'wav',
3360 }
3361
3362 ext = FULL_MAP.get(mt)
3363 if ext is not None:
3364 return ext
3365
3366 SUBTYPE_MAP = {
3367 '3gpp': '3gp',
3368 'smptett+xml': 'tt',
3369 'ttaf+xml': 'dfxp',
3370 'ttml+xml': 'ttml',
3371 'x-flv': 'flv',
3372 'x-mp4-fragmented': 'mp4',
3373 'x-ms-sami': 'sami',
3374 'x-ms-wmv': 'wmv',
3375 'mpegurl': 'm3u8',
3376 'x-mpegurl': 'm3u8',
3377 'vnd.apple.mpegurl': 'm3u8',
3378 'dash+xml': 'mpd',
3379 'f4m+xml': 'f4m',
3380 'hds+xml': 'f4m',
3381 'vnd.ms-sstr+xml': 'ism',
3382 'quicktime': 'mov',
3383 'mp2t': 'ts',
3384 'x-wav': 'wav',
3385 'filmstrip+json': 'fs',
3386 'svg+xml': 'svg',
3387 }
3388
3389 _, _, subtype = mt.rpartition('/')
3390 ext = SUBTYPE_MAP.get(subtype.lower())
3391 if ext is not None:
3392 return ext
3393
3394 SUFFIX_MAP = {
3395 'json': 'json',
3396 'xml': 'xml',
3397 'zip': 'zip',
3398 'gzip': 'gz',
3399 }
3400
3401 _, _, suffix = subtype.partition('+')
3402 ext = SUFFIX_MAP.get(suffix)
3403 if ext is not None:
3404 return ext
3405
3406 return subtype.replace('+', '.')
3407
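# Usage sketch (illustrative MIME types):
# >>> mimetype2ext('audio/mp4')
# 'm4a'
# >>> mimetype2ext('application/x-mpegURL')
# 'm3u8'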
3408
3409 def ext2mimetype(ext_or_url):
3410 if not ext_or_url:
3411 return None
3412 if '.' not in ext_or_url:
3413 ext_or_url = f'file.{ext_or_url}'
3414 return mimetypes.guess_type(ext_or_url)[0]
3415
3416
3417 def parse_codecs(codecs_str):
3418 # http://tools.ietf.org/html/rfc6381
3419 if not codecs_str:
3420 return {}
3421 split_codecs = list(filter(None, map(
3422 str.strip, codecs_str.strip().strip(',').split(','))))
3423 vcodec, acodec, scodec, hdr = None, None, None, None
3424 for full_codec in split_codecs:
3425 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3426 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3427 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3428 if vcodec:
3429 continue
3430 vcodec = full_codec
3431 if parts[0] in ('dvh1', 'dvhe'):
3432 hdr = 'DV'
3433 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3434 hdr = 'HDR10'
3435 elif parts[:2] == ['vp9', '2']:
3436 hdr = 'HDR10'
3437 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3438 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3439 acodec = acodec or full_codec
3440 elif parts[0] in ('stpp', 'wvtt'):
3441 scodec = scodec or full_codec
3442 else:
3443 write_string(f'WARNING: Unknown codec {full_codec}\n')
3444 if vcodec or acodec or scodec:
3445 return {
3446 'vcodec': vcodec or 'none',
3447 'acodec': acodec or 'none',
3448 'dynamic_range': hdr,
3449 **({'scodec': scodec} if scodec is not None else {}),
3450 }
3451 elif len(split_codecs) == 2:
3452 return {
3453 'vcodec': split_codecs[0],
3454 'acodec': split_codecs[1],
3455 }
3456 return {}
3457
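# Usage sketch (a typical RFC 6381 codecs string):
# >>> parse_codecs('avc1.64001f, mp4a.40.2')
# {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}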
3458
3459 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3460 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3461
3462 allow_mkv = not preferences or 'mkv' in preferences
3463
3464 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3465 return 'mkv' # TODO: any other format allows this?
3466
3467 # TODO: Not all codecs supported by parse_codecs are handled here
3468 COMPATIBLE_CODECS = {
3469 'mp4': {
3470 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3471 'h264', 'aacl', # Set in ISM
3472 },
3473 'webm': {
3474 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3475 'vp9x', 'vp8x', # in the webm spec
3476 },
3477 }
3478
3479 sanitize_codec = functools.partial(try_get, getter=lambda x: x.split('.')[0].replace('0', ''))
3480 vcodec, acodec = sanitize_codec(vcodecs[0]), sanitize_codec(acodecs[0])
3481
3482 for ext in preferences or COMPATIBLE_CODECS.keys():
3483 codec_set = COMPATIBLE_CODECS.get(ext, set())
3484 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3485 return ext
3486
3487 COMPATIBLE_EXTS = (
3488 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3489 {'webm'},
3490 )
3491 for ext in preferences or vexts:
3492 current_exts = {ext, *vexts, *aexts}
3493 if ext == 'mkv' or current_exts == {ext} or any(
3494 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3495 return ext
3496 return 'mkv' if allow_mkv else preferences[-1]
3497
3498
3499 def urlhandle_detect_ext(url_handle):
3500 getheader = url_handle.headers.get
3501
3502 cd = getheader('Content-Disposition')
3503 if cd:
3504 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3505 if m:
3506 e = determine_ext(m.group('filename'), default_ext=None)
3507 if e:
3508 return e
3509
3510 return mimetype2ext(getheader('Content-Type'))
3511
3512
3513 def encode_data_uri(data, mime_type):
3514 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3515
3516
3517 def age_restricted(content_limit, age_limit):
3518 """ Returns True iff the content should be blocked """
3519
3520 if age_limit is None: # No limit set
3521 return False
3522 if content_limit is None:
3523 return False # Content available for everyone
3524 return age_limit < content_limit
3525
3526
3527 # List of known byte-order-marks (BOM)
3528 BOMS = [
3529 (b'\xef\xbb\xbf', 'utf-8'),
3530 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3531 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3532 (b'\xff\xfe', 'utf-16-le'),
3533 (b'\xfe\xff', 'utf-16-be'),
3534 ]
3535
3536
3537 def is_html(first_bytes):
3538 """ Detect whether a file contains HTML by examining its first bytes. """
3539
3540 encoding = 'utf-8'
3541 for bom, enc in BOMS:
3542 while first_bytes.startswith(bom):
3543 encoding, first_bytes = enc, first_bytes[len(bom):]
3544
3545 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3546
3547
3548 def determine_protocol(info_dict):
3549 protocol = info_dict.get('protocol')
3550 if protocol is not None:
3551 return protocol
3552
3553 url = sanitize_url(info_dict['url'])
3554 if url.startswith('rtmp'):
3555 return 'rtmp'
3556 elif url.startswith('mms'):
3557 return 'mms'
3558 elif url.startswith('rtsp'):
3559 return 'rtsp'
3560
3561 ext = determine_ext(url)
3562 if ext == 'm3u8':
3563 return 'm3u8'
3564 elif ext == 'f4m':
3565 return 'f4m'
3566
3567 return urllib.parse.urlparse(url).scheme
3568
3569
3570 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3571 """ Render a list of rows, each as a list of values.
3572 Text after a \t will be right aligned """
3573 def width(string):
3574 return len(remove_terminal_sequences(string).replace('\t', ''))
3575
3576 def get_max_lens(table):
3577 return [max(width(str(v)) for v in col) for col in zip(*table)]
3578
3579 def filter_using_list(row, filterArray):
3580 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3581
3582 max_lens = get_max_lens(data) if hide_empty else []
3583 header_row = filter_using_list(header_row, max_lens)
3584 data = [filter_using_list(row, max_lens) for row in data]
3585
3586 table = [header_row] + data
3587 max_lens = get_max_lens(table)
3588 extra_gap += 1
3589 if delim:
3590 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3591 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3592 for row in table:
3593 for pos, text in enumerate(map(str, row)):
3594 if '\t' in text:
3595 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3596 else:
3597 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3598 ret = '\n'.join(''.join(row).rstrip() for row in table)
3599 return ret
3600
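# Illustrative example (editor's addition): columns are padded to the widest
# cell, with a single space of extra gap by default.
#     >>> print(render_table(['ID', 'NAME'], [['1', 'foo'], ['22', 'bar']]))
#     ID NAME
#     1  foo
#     22 bar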
3601
3602 def _match_one(filter_part, dct, incomplete):
3603 # TODO: Generalize code with YoutubeDL._build_format_filter
3604 STRING_OPERATORS = {
3605 '*=': operator.contains,
3606 '^=': lambda attr, value: attr.startswith(value),
3607 '$=': lambda attr, value: attr.endswith(value),
3608 '~=': lambda attr, value: re.search(value, attr),
3609 }
3610 COMPARISON_OPERATORS = {
3611 **STRING_OPERATORS,
3612 '<=': operator.le, # "<=" must be defined above "<"
3613 '<': operator.lt,
3614 '>=': operator.ge,
3615 '>': operator.gt,
3616 '=': operator.eq,
3617 }
3618
3619 if isinstance(incomplete, bool):
3620 is_incomplete = lambda _: incomplete
3621 else:
3622 is_incomplete = lambda k: k in incomplete
3623
3624 operator_rex = re.compile(r'''(?x)
3625 (?P<key>[a-z_]+)
3626 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3627 (?:
3628 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3629 (?P<strval>.+?)
3630 )
3631 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3632 m = operator_rex.fullmatch(filter_part.strip())
3633 if m:
3634 m = m.groupdict()
3635 unnegated_op = COMPARISON_OPERATORS[m['op']]
3636 if m['negation']:
3637 op = lambda attr, value: not unnegated_op(attr, value)
3638 else:
3639 op = unnegated_op
3640 comparison_value = m['quotedstrval'] or m['strval']
3641 if m['quote']:
3642 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3643 actual_value = dct.get(m['key'])
3644 numeric_comparison = None
3645 if isinstance(actual_value, (int, float)):
3646 # The comparison value is parsed as a number only when the actual
3647 # field value is numeric; if the original field is a string, the
3648 # comparison value is also processed as a string (see
3649 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3650 try:
3651 numeric_comparison = int(comparison_value)
3652 except ValueError:
3653 numeric_comparison = parse_filesize(comparison_value)
3654 if numeric_comparison is None:
3655 numeric_comparison = parse_filesize(f'{comparison_value}B')
3656 if numeric_comparison is None:
3657 numeric_comparison = parse_duration(comparison_value)
3658 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3659 raise ValueError('Operator %s only supports string values!' % m['op'])
3660 if actual_value is None:
3661 return is_incomplete(m['key']) or m['none_inclusive']
3662 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3663
3664 UNARY_OPERATORS = {
3665 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3666 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3667 }
3668 operator_rex = re.compile(r'''(?x)
3669 (?P<op>%s)\s*(?P<key>[a-z_]+)
3670 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3671 m = operator_rex.fullmatch(filter_part.strip())
3672 if m:
3673 op = UNARY_OPERATORS[m.group('op')]
3674 actual_value = dct.get(m.group('key'))
3675 if is_incomplete(m.group('key')) and actual_value is None:
3676 return True
3677 return op(actual_value)
3678
3679 raise ValueError('Invalid filter part %r' % filter_part)
3680
3681
3682 def match_str(filter_str, dct, incomplete=False):
3683 """ Filter a dictionary with a simple string syntax.
3684 @returns Whether the filter passes
3685 @param incomplete Set of keys that are expected to be missing from dct.
3686 Can be True/False to indicate that all/none of the keys may be missing.
3687 All conditions on incomplete keys pass if the key is missing.
3688 """
3689 return all(
3690 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3691 for filter_part in re.split(r'(?<!\\)&', filter_str))
3692
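# Illustrative examples (editor's addition): '&' separates conditions, string
# operators like '*=' test containment, and '!' negates a field's presence.
#     >>> match_str('duration > 60 & title *= yt', {'duration': 100, 'title': 'yt-dlp test'})
#     True
#     >>> match_str('!is_live', {'is_live': False})
#     True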
3693
3694 def match_filter_func(filters):
3695 if not filters:
3696 return None
3697 filters = set(variadic(filters))
3698
3699 interactive = '-' in filters
3700 if interactive:
3701 filters.remove('-')
3702
3703 def _match_func(info_dict, incomplete=False):
3704 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3705 return NO_DEFAULT if interactive and not incomplete else None
3706 else:
3707 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3708 filter_str = ') | ('.join(map(str.strip, filters))
3709 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3710 return _match_func
3711
3712
3713 class download_range_func:
3714 def __init__(self, chapters, ranges):
3715 self.chapters, self.ranges = chapters, ranges
3716
3717 def __call__(self, info_dict, ydl):
3718 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3719 else 'Cannot match chapters since chapter information is unavailable')
3720 for regex in self.chapters or []:
3721 for i, chapter in enumerate(info_dict.get('chapters') or []):
3722 if re.search(regex, chapter['title']):
3723 warning = None
3724 yield {**chapter, 'index': i}
3725 if self.chapters and warning:
3726 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3727
3728 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3729
3730 def __eq__(self, other):
3731 return (isinstance(other, download_range_func)
3732 and self.chapters == other.chapters and self.ranges == other.ranges)
3733
3734
3735 def parse_dfxp_time_expr(time_expr):
3736 if not time_expr:
3737 return
3738
3739 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3740 if mobj:
3741 return float(mobj.group('time_offset'))
3742
3743 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3744 if mobj:
3745 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3746
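# Illustrative examples (editor's addition): both plain seconds and
# HH:MM:SS(.mmm) clock values are accepted.
#     >>> parse_dfxp_time_expr('5.1s')
#     5.1
#     >>> parse_dfxp_time_expr('00:01:30.5')
#     90.5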
3747
3748 def srt_subtitles_timecode(seconds):
3749 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3750
3751
3752 def ass_subtitles_timecode(seconds):
3753 time = timetuple_from_msec(seconds * 1000)
3754 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3755
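# Illustrative examples (editor's addition): the same instant rendered in SRT
# (comma, milliseconds) and ASS (dot, centiseconds) notation.
#     >>> srt_subtitles_timecode(3661.5)
#     '01:01:01,500'
#     >>> ass_subtitles_timecode(3661.5)
#     '1:01:01.50'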
3756
3757 def dfxp2srt(dfxp_data):
3758 '''
3759 @param dfxp_data A bytes-like object containing DFXP data
3760 @returns A unicode object containing converted SRT data
3761 '''
3762 LEGACY_NAMESPACES = (
3763 (b'http://www.w3.org/ns/ttml', [
3764 b'http://www.w3.org/2004/11/ttaf1',
3765 b'http://www.w3.org/2006/04/ttaf1',
3766 b'http://www.w3.org/2006/10/ttaf1',
3767 ]),
3768 (b'http://www.w3.org/ns/ttml#styling', [
3769 b'http://www.w3.org/ns/ttml#style',
3770 ]),
3771 )
3772
3773 SUPPORTED_STYLING = [
3774 'color',
3775 'fontFamily',
3776 'fontSize',
3777 'fontStyle',
3778 'fontWeight',
3779 'textDecoration'
3780 ]
3781
3782 _x = functools.partial(xpath_with_ns, ns_map={
3783 'xml': 'http://www.w3.org/XML/1998/namespace',
3784 'ttml': 'http://www.w3.org/ns/ttml',
3785 'tts': 'http://www.w3.org/ns/ttml#styling',
3786 })
3787
3788 styles = {}
3789 default_style = {}
3790
3791 class TTMLPElementParser:
3792 _out = ''
3793 _unclosed_elements = []
3794 _applied_styles = []
3795
3796 def start(self, tag, attrib):
3797 if tag in (_x('ttml:br'), 'br'):
3798 self._out += '\n'
3799 else:
3800 unclosed_elements = []
3801 style = {}
3802 element_style_id = attrib.get('style')
3803 if default_style:
3804 style.update(default_style)
3805 if element_style_id:
3806 style.update(styles.get(element_style_id, {}))
3807 for prop in SUPPORTED_STYLING:
3808 prop_val = attrib.get(_x('tts:' + prop))
3809 if prop_val:
3810 style[prop] = prop_val
3811 if style:
3812 font = ''
3813 for k, v in sorted(style.items()):
3814 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3815 continue
3816 if k == 'color':
3817 font += ' color="%s"' % v
3818 elif k == 'fontSize':
3819 font += ' size="%s"' % v
3820 elif k == 'fontFamily':
3821 font += ' face="%s"' % v
3822 elif k == 'fontWeight' and v == 'bold':
3823 self._out += '<b>'
3824 unclosed_elements.append('b')
3825 elif k == 'fontStyle' and v == 'italic':
3826 self._out += '<i>'
3827 unclosed_elements.append('i')
3828 elif k == 'textDecoration' and v == 'underline':
3829 self._out += '<u>'
3830 unclosed_elements.append('u')
3831 if font:
3832 self._out += '<font' + font + '>'
3833 unclosed_elements.append('font')
3834 applied_style = {}
3835 if self._applied_styles:
3836 applied_style.update(self._applied_styles[-1])
3837 applied_style.update(style)
3838 self._applied_styles.append(applied_style)
3839 self._unclosed_elements.append(unclosed_elements)
3840
3841 def end(self, tag):
3842 if tag not in (_x('ttml:br'), 'br'):
3843 unclosed_elements = self._unclosed_elements.pop()
3844 for element in reversed(unclosed_elements):
3845 self._out += '</%s>' % element
3846 if unclosed_elements and self._applied_styles:
3847 self._applied_styles.pop()
3848
3849 def data(self, data):
3850 self._out += data
3851
3852 def close(self):
3853 return self._out.strip()
3854
3855 def parse_node(node):
3856 target = TTMLPElementParser()
3857 parser = xml.etree.ElementTree.XMLParser(target=target)
3858 parser.feed(xml.etree.ElementTree.tostring(node))
3859 return parser.close()
3860
3861 for k, v in LEGACY_NAMESPACES:
3862 for ns in v:
3863 dfxp_data = dfxp_data.replace(ns, k)
3864
3865 dfxp = compat_etree_fromstring(dfxp_data)
3866 out = []
3867 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3868
3869 if not paras:
3870 raise ValueError('Invalid dfxp/TTML subtitle')
3871
3872 repeat = False
3873 while True:
3874 for style in dfxp.findall(_x('.//ttml:style')):
3875 style_id = style.get('id') or style.get(_x('xml:id'))
3876 if not style_id:
3877 continue
3878 parent_style_id = style.get('style')
3879 if parent_style_id:
3880 if parent_style_id not in styles:
3881 repeat = True
3882 continue
3883 styles[style_id] = styles[parent_style_id].copy()
3884 for prop in SUPPORTED_STYLING:
3885 prop_val = style.get(_x('tts:' + prop))
3886 if prop_val:
3887 styles.setdefault(style_id, {})[prop] = prop_val
3888 if repeat:
3889 repeat = False
3890 else:
3891 break
3892
3893 for p in ('body', 'div'):
3894 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3895 if ele is None:
3896 continue
3897 style = styles.get(ele.get('style'))
3898 if not style:
3899 continue
3900 default_style.update(style)
3901
3902 for para, index in zip(paras, itertools.count(1)):
3903 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3904 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3905 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3906 if begin_time is None:
3907 continue
3908 if not end_time:
3909 if not dur:
3910 continue
3911 end_time = begin_time + dur
3912 out.append('%d\n%s --> %s\n%s\n\n' % (
3913 index,
3914 srt_subtitles_timecode(begin_time),
3915 srt_subtitles_timecode(end_time),
3916 parse_node(para)))
3917
3918 return ''.join(out)
3919
3920
3921 def cli_option(params, command_option, param, separator=None):
3922 param = params.get(param)
3923 return ([] if param is None
3924 else [command_option, str(param)] if separator is None
3925 else [f'{command_option}{separator}{param}'])
3926
3927
3928 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3929 param = params.get(param)
3930 assert param in (True, False, None)
3931 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3932
3933
3934 def cli_valueless_option(params, command_option, param, expected_value=True):
3935 return [command_option] if params.get(param) == expected_value else []
3936
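# Illustrative examples (editor's addition): building argv fragments from an
# options dict with the three cli_* helpers above.
#     >>> cli_option({'proxy': 'http://x'}, '--proxy', 'proxy')
#     ['--proxy', 'http://x']
#     >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#     ['--no-check-certificate', 'true']
#     >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#     ['--quiet']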
3937
3938 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3939 if isinstance(argdict, (list, tuple)): # for backward compatibility
3940 if use_compat:
3941 return argdict
3942 else:
3943 argdict = None
3944 if argdict is None:
3945 return default
3946 assert isinstance(argdict, dict)
3947
3948 assert isinstance(keys, (list, tuple))
3949 for key_list in keys:
3950 arg_list = list(filter(
3951 lambda x: x is not None,
3952 [argdict.get(key.lower()) for key in variadic(key_list)]))
3953 if arg_list:
3954 return [arg for args in arg_list for arg in args]
3955 return default
3956
3957
3958 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3959 main_key, exe = main_key.lower(), exe.lower()
3960 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3961 keys = [f'{root_key}{k}' for k in (keys or [''])]
3962 if root_key in keys:
3963 if main_key != exe:
3964 keys.append((main_key, exe))
3965 keys.append('default')
3966 else:
3967 use_compat = False
3968 return cli_configuration_args(argdict, keys, default, use_compat)
3969
3970
3971 class ISO639Utils:
3972 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3973 _lang_map = {
3974 'aa': 'aar',
3975 'ab': 'abk',
3976 'ae': 'ave',
3977 'af': 'afr',
3978 'ak': 'aka',
3979 'am': 'amh',
3980 'an': 'arg',
3981 'ar': 'ara',
3982 'as': 'asm',
3983 'av': 'ava',
3984 'ay': 'aym',
3985 'az': 'aze',
3986 'ba': 'bak',
3987 'be': 'bel',
3988 'bg': 'bul',
3989 'bh': 'bih',
3990 'bi': 'bis',
3991 'bm': 'bam',
3992 'bn': 'ben',
3993 'bo': 'bod',
3994 'br': 'bre',
3995 'bs': 'bos',
3996 'ca': 'cat',
3997 'ce': 'che',
3998 'ch': 'cha',
3999 'co': 'cos',
4000 'cr': 'cre',
4001 'cs': 'ces',
4002 'cu': 'chu',
4003 'cv': 'chv',
4004 'cy': 'cym',
4005 'da': 'dan',
4006 'de': 'deu',
4007 'dv': 'div',
4008 'dz': 'dzo',
4009 'ee': 'ewe',
4010 'el': 'ell',
4011 'en': 'eng',
4012 'eo': 'epo',
4013 'es': 'spa',
4014 'et': 'est',
4015 'eu': 'eus',
4016 'fa': 'fas',
4017 'ff': 'ful',
4018 'fi': 'fin',
4019 'fj': 'fij',
4020 'fo': 'fao',
4021 'fr': 'fra',
4022 'fy': 'fry',
4023 'ga': 'gle',
4024 'gd': 'gla',
4025 'gl': 'glg',
4026 'gn': 'grn',
4027 'gu': 'guj',
4028 'gv': 'glv',
4029 'ha': 'hau',
4030 'he': 'heb',
4031 'iw': 'heb', # Replaced by he in 1989 revision
4032 'hi': 'hin',
4033 'ho': 'hmo',
4034 'hr': 'hrv',
4035 'ht': 'hat',
4036 'hu': 'hun',
4037 'hy': 'hye',
4038 'hz': 'her',
4039 'ia': 'ina',
4040 'id': 'ind',
4041 'in': 'ind', # Replaced by id in 1989 revision
4042 'ie': 'ile',
4043 'ig': 'ibo',
4044 'ii': 'iii',
4045 'ik': 'ipk',
4046 'io': 'ido',
4047 'is': 'isl',
4048 'it': 'ita',
4049 'iu': 'iku',
4050 'ja': 'jpn',
4051 'jv': 'jav',
4052 'ka': 'kat',
4053 'kg': 'kon',
4054 'ki': 'kik',
4055 'kj': 'kua',
4056 'kk': 'kaz',
4057 'kl': 'kal',
4058 'km': 'khm',
4059 'kn': 'kan',
4060 'ko': 'kor',
4061 'kr': 'kau',
4062 'ks': 'kas',
4063 'ku': 'kur',
4064 'kv': 'kom',
4065 'kw': 'cor',
4066 'ky': 'kir',
4067 'la': 'lat',
4068 'lb': 'ltz',
4069 'lg': 'lug',
4070 'li': 'lim',
4071 'ln': 'lin',
4072 'lo': 'lao',
4073 'lt': 'lit',
4074 'lu': 'lub',
4075 'lv': 'lav',
4076 'mg': 'mlg',
4077 'mh': 'mah',
4078 'mi': 'mri',
4079 'mk': 'mkd',
4080 'ml': 'mal',
4081 'mn': 'mon',
4082 'mr': 'mar',
4083 'ms': 'msa',
4084 'mt': 'mlt',
4085 'my': 'mya',
4086 'na': 'nau',
4087 'nb': 'nob',
4088 'nd': 'nde',
4089 'ne': 'nep',
4090 'ng': 'ndo',
4091 'nl': 'nld',
4092 'nn': 'nno',
4093 'no': 'nor',
4094 'nr': 'nbl',
4095 'nv': 'nav',
4096 'ny': 'nya',
4097 'oc': 'oci',
4098 'oj': 'oji',
4099 'om': 'orm',
4100 'or': 'ori',
4101 'os': 'oss',
4102 'pa': 'pan',
4103 'pi': 'pli',
4104 'pl': 'pol',
4105 'ps': 'pus',
4106 'pt': 'por',
4107 'qu': 'que',
4108 'rm': 'roh',
4109 'rn': 'run',
4110 'ro': 'ron',
4111 'ru': 'rus',
4112 'rw': 'kin',
4113 'sa': 'san',
4114 'sc': 'srd',
4115 'sd': 'snd',
4116 'se': 'sme',
4117 'sg': 'sag',
4118 'si': 'sin',
4119 'sk': 'slk',
4120 'sl': 'slv',
4121 'sm': 'smo',
4122 'sn': 'sna',
4123 'so': 'som',
4124 'sq': 'sqi',
4125 'sr': 'srp',
4126 'ss': 'ssw',
4127 'st': 'sot',
4128 'su': 'sun',
4129 'sv': 'swe',
4130 'sw': 'swa',
4131 'ta': 'tam',
4132 'te': 'tel',
4133 'tg': 'tgk',
4134 'th': 'tha',
4135 'ti': 'tir',
4136 'tk': 'tuk',
4137 'tl': 'tgl',
4138 'tn': 'tsn',
4139 'to': 'ton',
4140 'tr': 'tur',
4141 'ts': 'tso',
4142 'tt': 'tat',
4143 'tw': 'twi',
4144 'ty': 'tah',
4145 'ug': 'uig',
4146 'uk': 'ukr',
4147 'ur': 'urd',
4148 'uz': 'uzb',
4149 've': 'ven',
4150 'vi': 'vie',
4151 'vo': 'vol',
4152 'wa': 'wln',
4153 'wo': 'wol',
4154 'xh': 'xho',
4155 'yi': 'yid',
4156 'ji': 'yid', # Replaced by yi in 1989 revision
4157 'yo': 'yor',
4158 'za': 'zha',
4159 'zh': 'zho',
4160 'zu': 'zul',
4161 }
4162
4163 @classmethod
4164 def short2long(cls, code):
4165 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4166 return cls._lang_map.get(code[:2])
4167
4168 @classmethod
4169 def long2short(cls, code):
4170 """Convert language code from ISO 639-2/T to ISO 639-1"""
4171 for short_name, long_name in cls._lang_map.items():
4172 if long_name == code:
4173 return short_name
4174
4175
4176 class ISO3166Utils:
4177 # From http://data.okfn.org/data/core/country-list
4178 _country_map = {
4179 'AF': 'Afghanistan',
4180 'AX': 'Åland Islands',
4181 'AL': 'Albania',
4182 'DZ': 'Algeria',
4183 'AS': 'American Samoa',
4184 'AD': 'Andorra',
4185 'AO': 'Angola',
4186 'AI': 'Anguilla',
4187 'AQ': 'Antarctica',
4188 'AG': 'Antigua and Barbuda',
4189 'AR': 'Argentina',
4190 'AM': 'Armenia',
4191 'AW': 'Aruba',
4192 'AU': 'Australia',
4193 'AT': 'Austria',
4194 'AZ': 'Azerbaijan',
4195 'BS': 'Bahamas',
4196 'BH': 'Bahrain',
4197 'BD': 'Bangladesh',
4198 'BB': 'Barbados',
4199 'BY': 'Belarus',
4200 'BE': 'Belgium',
4201 'BZ': 'Belize',
4202 'BJ': 'Benin',
4203 'BM': 'Bermuda',
4204 'BT': 'Bhutan',
4205 'BO': 'Bolivia, Plurinational State of',
4206 'BQ': 'Bonaire, Sint Eustatius and Saba',
4207 'BA': 'Bosnia and Herzegovina',
4208 'BW': 'Botswana',
4209 'BV': 'Bouvet Island',
4210 'BR': 'Brazil',
4211 'IO': 'British Indian Ocean Territory',
4212 'BN': 'Brunei Darussalam',
4213 'BG': 'Bulgaria',
4214 'BF': 'Burkina Faso',
4215 'BI': 'Burundi',
4216 'KH': 'Cambodia',
4217 'CM': 'Cameroon',
4218 'CA': 'Canada',
4219 'CV': 'Cape Verde',
4220 'KY': 'Cayman Islands',
4221 'CF': 'Central African Republic',
4222 'TD': 'Chad',
4223 'CL': 'Chile',
4224 'CN': 'China',
4225 'CX': 'Christmas Island',
4226 'CC': 'Cocos (Keeling) Islands',
4227 'CO': 'Colombia',
4228 'KM': 'Comoros',
4229 'CG': 'Congo',
4230 'CD': 'Congo, the Democratic Republic of the',
4231 'CK': 'Cook Islands',
4232 'CR': 'Costa Rica',
4233 'CI': 'Côte d\'Ivoire',
4234 'HR': 'Croatia',
4235 'CU': 'Cuba',
4236 'CW': 'Curaçao',
4237 'CY': 'Cyprus',
4238 'CZ': 'Czech Republic',
4239 'DK': 'Denmark',
4240 'DJ': 'Djibouti',
4241 'DM': 'Dominica',
4242 'DO': 'Dominican Republic',
4243 'EC': 'Ecuador',
4244 'EG': 'Egypt',
4245 'SV': 'El Salvador',
4246 'GQ': 'Equatorial Guinea',
4247 'ER': 'Eritrea',
4248 'EE': 'Estonia',
4249 'ET': 'Ethiopia',
4250 'FK': 'Falkland Islands (Malvinas)',
4251 'FO': 'Faroe Islands',
4252 'FJ': 'Fiji',
4253 'FI': 'Finland',
4254 'FR': 'France',
4255 'GF': 'French Guiana',
4256 'PF': 'French Polynesia',
4257 'TF': 'French Southern Territories',
4258 'GA': 'Gabon',
4259 'GM': 'Gambia',
4260 'GE': 'Georgia',
4261 'DE': 'Germany',
4262 'GH': 'Ghana',
4263 'GI': 'Gibraltar',
4264 'GR': 'Greece',
4265 'GL': 'Greenland',
4266 'GD': 'Grenada',
4267 'GP': 'Guadeloupe',
4268 'GU': 'Guam',
4269 'GT': 'Guatemala',
4270 'GG': 'Guernsey',
4271 'GN': 'Guinea',
4272 'GW': 'Guinea-Bissau',
4273 'GY': 'Guyana',
4274 'HT': 'Haiti',
4275 'HM': 'Heard Island and McDonald Islands',
4276 'VA': 'Holy See (Vatican City State)',
4277 'HN': 'Honduras',
4278 'HK': 'Hong Kong',
4279 'HU': 'Hungary',
4280 'IS': 'Iceland',
4281 'IN': 'India',
4282 'ID': 'Indonesia',
4283 'IR': 'Iran, Islamic Republic of',
4284 'IQ': 'Iraq',
4285 'IE': 'Ireland',
4286 'IM': 'Isle of Man',
4287 'IL': 'Israel',
4288 'IT': 'Italy',
4289 'JM': 'Jamaica',
4290 'JP': 'Japan',
4291 'JE': 'Jersey',
4292 'JO': 'Jordan',
4293 'KZ': 'Kazakhstan',
4294 'KE': 'Kenya',
4295 'KI': 'Kiribati',
4296 'KP': 'Korea, Democratic People\'s Republic of',
4297 'KR': 'Korea, Republic of',
4298 'KW': 'Kuwait',
4299 'KG': 'Kyrgyzstan',
4300 'LA': 'Lao People\'s Democratic Republic',
4301 'LV': 'Latvia',
4302 'LB': 'Lebanon',
4303 'LS': 'Lesotho',
4304 'LR': 'Liberia',
4305 'LY': 'Libya',
4306 'LI': 'Liechtenstein',
4307 'LT': 'Lithuania',
4308 'LU': 'Luxembourg',
4309 'MO': 'Macao',
4310 'MK': 'Macedonia, the Former Yugoslav Republic of',
4311 'MG': 'Madagascar',
4312 'MW': 'Malawi',
4313 'MY': 'Malaysia',
4314 'MV': 'Maldives',
4315 'ML': 'Mali',
4316 'MT': 'Malta',
4317 'MH': 'Marshall Islands',
4318 'MQ': 'Martinique',
4319 'MR': 'Mauritania',
4320 'MU': 'Mauritius',
4321 'YT': 'Mayotte',
4322 'MX': 'Mexico',
4323 'FM': 'Micronesia, Federated States of',
4324 'MD': 'Moldova, Republic of',
4325 'MC': 'Monaco',
4326 'MN': 'Mongolia',
4327 'ME': 'Montenegro',
4328 'MS': 'Montserrat',
4329 'MA': 'Morocco',
4330 'MZ': 'Mozambique',
4331 'MM': 'Myanmar',
4332 'NA': 'Namibia',
4333 'NR': 'Nauru',
4334 'NP': 'Nepal',
4335 'NL': 'Netherlands',
4336 'NC': 'New Caledonia',
4337 'NZ': 'New Zealand',
4338 'NI': 'Nicaragua',
4339 'NE': 'Niger',
4340 'NG': 'Nigeria',
4341 'NU': 'Niue',
4342 'NF': 'Norfolk Island',
4343 'MP': 'Northern Mariana Islands',
4344 'NO': 'Norway',
4345 'OM': 'Oman',
4346 'PK': 'Pakistan',
4347 'PW': 'Palau',
4348 'PS': 'Palestine, State of',
4349 'PA': 'Panama',
4350 'PG': 'Papua New Guinea',
4351 'PY': 'Paraguay',
4352 'PE': 'Peru',
4353 'PH': 'Philippines',
4354 'PN': 'Pitcairn',
4355 'PL': 'Poland',
4356 'PT': 'Portugal',
4357 'PR': 'Puerto Rico',
4358 'QA': 'Qatar',
4359 'RE': 'Réunion',
4360 'RO': 'Romania',
4361 'RU': 'Russian Federation',
4362 'RW': 'Rwanda',
4363 'BL': 'Saint Barthélemy',
4364 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4365 'KN': 'Saint Kitts and Nevis',
4366 'LC': 'Saint Lucia',
4367 'MF': 'Saint Martin (French part)',
4368 'PM': 'Saint Pierre and Miquelon',
4369 'VC': 'Saint Vincent and the Grenadines',
4370 'WS': 'Samoa',
4371 'SM': 'San Marino',
4372 'ST': 'Sao Tome and Principe',
4373 'SA': 'Saudi Arabia',
4374 'SN': 'Senegal',
4375 'RS': 'Serbia',
4376 'SC': 'Seychelles',
4377 'SL': 'Sierra Leone',
4378 'SG': 'Singapore',
4379 'SX': 'Sint Maarten (Dutch part)',
4380 'SK': 'Slovakia',
4381 'SI': 'Slovenia',
4382 'SB': 'Solomon Islands',
4383 'SO': 'Somalia',
4384 'ZA': 'South Africa',
4385 'GS': 'South Georgia and the South Sandwich Islands',
4386 'SS': 'South Sudan',
4387 'ES': 'Spain',
4388 'LK': 'Sri Lanka',
4389 'SD': 'Sudan',
4390 'SR': 'Suriname',
4391 'SJ': 'Svalbard and Jan Mayen',
4392 'SZ': 'Swaziland',
4393 'SE': 'Sweden',
4394 'CH': 'Switzerland',
4395 'SY': 'Syrian Arab Republic',
4396 'TW': 'Taiwan, Province of China',
4397 'TJ': 'Tajikistan',
4398 'TZ': 'Tanzania, United Republic of',
4399 'TH': 'Thailand',
4400 'TL': 'Timor-Leste',
4401 'TG': 'Togo',
4402 'TK': 'Tokelau',
4403 'TO': 'Tonga',
4404 'TT': 'Trinidad and Tobago',
4405 'TN': 'Tunisia',
4406 'TR': 'Turkey',
4407 'TM': 'Turkmenistan',
4408 'TC': 'Turks and Caicos Islands',
4409 'TV': 'Tuvalu',
4410 'UG': 'Uganda',
4411 'UA': 'Ukraine',
4412 'AE': 'United Arab Emirates',
4413 'GB': 'United Kingdom',
4414 'US': 'United States',
4415 'UM': 'United States Minor Outlying Islands',
4416 'UY': 'Uruguay',
4417 'UZ': 'Uzbekistan',
4418 'VU': 'Vanuatu',
4419 'VE': 'Venezuela, Bolivarian Republic of',
4420 'VN': 'Viet Nam',
4421 'VG': 'Virgin Islands, British',
4422 'VI': 'Virgin Islands, U.S.',
4423 'WF': 'Wallis and Futuna',
4424 'EH': 'Western Sahara',
4425 'YE': 'Yemen',
4426 'ZM': 'Zambia',
4427 'ZW': 'Zimbabwe',
4428 # Not ISO 3166 codes, but used for IP blocks
4429 'AP': 'Asia/Pacific Region',
4430 'EU': 'Europe',
4431 }
4432
4433 @classmethod
4434 def short2full(cls, code):
4435 """Convert an ISO 3166-2 country code to the corresponding full name"""
4436 return cls._country_map.get(code.upper())
4437
4438
4439 class GeoUtils:
4440 # Major IPv4 address blocks per country
4441 _country_ip_map = {
4442 'AD': '46.172.224.0/19',
4443 'AE': '94.200.0.0/13',
4444 'AF': '149.54.0.0/17',
4445 'AG': '209.59.64.0/18',
4446 'AI': '204.14.248.0/21',
4447 'AL': '46.99.0.0/16',
4448 'AM': '46.70.0.0/15',
4449 'AO': '105.168.0.0/13',
4450 'AP': '182.50.184.0/21',
4451 'AQ': '23.154.160.0/24',
4452 'AR': '181.0.0.0/12',
4453 'AS': '202.70.112.0/20',
4454 'AT': '77.116.0.0/14',
4455 'AU': '1.128.0.0/11',
4456 'AW': '181.41.0.0/18',
4457 'AX': '185.217.4.0/22',
4458 'AZ': '5.197.0.0/16',
4459 'BA': '31.176.128.0/17',
4460 'BB': '65.48.128.0/17',
4461 'BD': '114.130.0.0/16',
4462 'BE': '57.0.0.0/8',
4463 'BF': '102.178.0.0/15',
4464 'BG': '95.42.0.0/15',
4465 'BH': '37.131.0.0/17',
4466 'BI': '154.117.192.0/18',
4467 'BJ': '137.255.0.0/16',
4468 'BL': '185.212.72.0/23',
4469 'BM': '196.12.64.0/18',
4470 'BN': '156.31.0.0/16',
4471 'BO': '161.56.0.0/16',
4472 'BQ': '161.0.80.0/20',
4473 'BR': '191.128.0.0/12',
4474 'BS': '24.51.64.0/18',
4475 'BT': '119.2.96.0/19',
4476 'BW': '168.167.0.0/16',
4477 'BY': '178.120.0.0/13',
4478 'BZ': '179.42.192.0/18',
4479 'CA': '99.224.0.0/11',
4480 'CD': '41.243.0.0/16',
4481 'CF': '197.242.176.0/21',
4482 'CG': '160.113.0.0/16',
4483 'CH': '85.0.0.0/13',
4484 'CI': '102.136.0.0/14',
4485 'CK': '202.65.32.0/19',
4486 'CL': '152.172.0.0/14',
4487 'CM': '102.244.0.0/14',
4488 'CN': '36.128.0.0/10',
4489 'CO': '181.240.0.0/12',
4490 'CR': '201.192.0.0/12',
4491 'CU': '152.206.0.0/15',
4492 'CV': '165.90.96.0/19',
4493 'CW': '190.88.128.0/17',
4494 'CY': '31.153.0.0/16',
4495 'CZ': '88.100.0.0/14',
4496 'DE': '53.0.0.0/8',
4497 'DJ': '197.241.0.0/17',
4498 'DK': '87.48.0.0/12',
4499 'DM': '192.243.48.0/20',
4500 'DO': '152.166.0.0/15',
4501 'DZ': '41.96.0.0/12',
4502 'EC': '186.68.0.0/15',
4503 'EE': '90.190.0.0/15',
4504 'EG': '156.160.0.0/11',
4505 'ER': '196.200.96.0/20',
4506 'ES': '88.0.0.0/11',
4507 'ET': '196.188.0.0/14',
4508 'EU': '2.16.0.0/13',
4509 'FI': '91.152.0.0/13',
4510 'FJ': '144.120.0.0/16',
4511 'FK': '80.73.208.0/21',
4512 'FM': '119.252.112.0/20',
4513 'FO': '88.85.32.0/19',
4514 'FR': '90.0.0.0/9',
4515 'GA': '41.158.0.0/15',
4516 'GB': '25.0.0.0/8',
4517 'GD': '74.122.88.0/21',
4518 'GE': '31.146.0.0/16',
4519 'GF': '161.22.64.0/18',
4520 'GG': '62.68.160.0/19',
4521 'GH': '154.160.0.0/12',
4522 'GI': '95.164.0.0/16',
4523 'GL': '88.83.0.0/19',
4524 'GM': '160.182.0.0/15',
4525 'GN': '197.149.192.0/18',
4526 'GP': '104.250.0.0/19',
4527 'GQ': '105.235.224.0/20',
4528 'GR': '94.64.0.0/13',
4529 'GT': '168.234.0.0/16',
4530 'GU': '168.123.0.0/16',
4531 'GW': '197.214.80.0/20',
4532 'GY': '181.41.64.0/18',
4533 'HK': '113.252.0.0/14',
4534 'HN': '181.210.0.0/16',
4535 'HR': '93.136.0.0/13',
4536 'HT': '148.102.128.0/17',
4537 'HU': '84.0.0.0/14',
4538 'ID': '39.192.0.0/10',
4539 'IE': '87.32.0.0/12',
4540 'IL': '79.176.0.0/13',
4541 'IM': '5.62.80.0/20',
4542 'IN': '117.192.0.0/10',
4543 'IO': '203.83.48.0/21',
4544 'IQ': '37.236.0.0/14',
4545 'IR': '2.176.0.0/12',
4546 'IS': '82.221.0.0/16',
4547 'IT': '79.0.0.0/10',
4548 'JE': '87.244.64.0/18',
4549 'JM': '72.27.0.0/17',
4550 'JO': '176.29.0.0/16',
4551 'JP': '133.0.0.0/8',
4552 'KE': '105.48.0.0/12',
4553 'KG': '158.181.128.0/17',
4554 'KH': '36.37.128.0/17',
4555 'KI': '103.25.140.0/22',
4556 'KM': '197.255.224.0/20',
4557 'KN': '198.167.192.0/19',
4558 'KP': '175.45.176.0/22',
4559 'KR': '175.192.0.0/10',
4560 'KW': '37.36.0.0/14',
4561 'KY': '64.96.0.0/15',
4562 'KZ': '2.72.0.0/13',
4563 'LA': '115.84.64.0/18',
4564 'LB': '178.135.0.0/16',
4565 'LC': '24.92.144.0/20',
4566 'LI': '82.117.0.0/19',
4567 'LK': '112.134.0.0/15',
4568 'LR': '102.183.0.0/16',
4569 'LS': '129.232.0.0/17',
4570 'LT': '78.56.0.0/13',
4571 'LU': '188.42.0.0/16',
4572 'LV': '46.109.0.0/16',
4573 'LY': '41.252.0.0/14',
4574 'MA': '105.128.0.0/11',
4575 'MC': '88.209.64.0/18',
4576 'MD': '37.246.0.0/16',
4577 'ME': '178.175.0.0/17',
4578 'MF': '74.112.232.0/21',
4579 'MG': '154.126.0.0/17',
4580 'MH': '117.103.88.0/21',
4581 'MK': '77.28.0.0/15',
4582 'ML': '154.118.128.0/18',
4583 'MM': '37.111.0.0/17',
4584 'MN': '49.0.128.0/17',
4585 'MO': '60.246.0.0/16',
4586 'MP': '202.88.64.0/20',
4587 'MQ': '109.203.224.0/19',
4588 'MR': '41.188.64.0/18',
4589 'MS': '208.90.112.0/22',
4590 'MT': '46.11.0.0/16',
4591 'MU': '105.16.0.0/12',
4592 'MV': '27.114.128.0/18',
4593 'MW': '102.70.0.0/15',
4594 'MX': '187.192.0.0/11',
4595 'MY': '175.136.0.0/13',
4596 'MZ': '197.218.0.0/15',
4597 'NA': '41.182.0.0/16',
4598 'NC': '101.101.0.0/18',
4599 'NE': '197.214.0.0/18',
4600 'NF': '203.17.240.0/22',
4601 'NG': '105.112.0.0/12',
4602 'NI': '186.76.0.0/15',
4603 'NL': '145.96.0.0/11',
4604 'NO': '84.208.0.0/13',
4605 'NP': '36.252.0.0/15',
4606 'NR': '203.98.224.0/19',
4607 'NU': '49.156.48.0/22',
4608 'NZ': '49.224.0.0/14',
4609 'OM': '5.36.0.0/15',
4610 'PA': '186.72.0.0/15',
4611 'PE': '186.160.0.0/14',
4612 'PF': '123.50.64.0/18',
4613 'PG': '124.240.192.0/19',
4614 'PH': '49.144.0.0/13',
4615 'PK': '39.32.0.0/11',
4616 'PL': '83.0.0.0/11',
4617 'PM': '70.36.0.0/20',
4618 'PR': '66.50.0.0/16',
4619 'PS': '188.161.0.0/16',
4620 'PT': '85.240.0.0/13',
4621 'PW': '202.124.224.0/20',
4622 'PY': '181.120.0.0/14',
4623 'QA': '37.210.0.0/15',
4624 'RE': '102.35.0.0/16',
4625 'RO': '79.112.0.0/13',
4626 'RS': '93.86.0.0/15',
4627 'RU': '5.136.0.0/13',
4628 'RW': '41.186.0.0/16',
4629 'SA': '188.48.0.0/13',
4630 'SB': '202.1.160.0/19',
4631 'SC': '154.192.0.0/11',
4632 'SD': '102.120.0.0/13',
4633 'SE': '78.64.0.0/12',
4634 'SG': '8.128.0.0/10',
4635 'SI': '188.196.0.0/14',
4636 'SK': '78.98.0.0/15',
4637 'SL': '102.143.0.0/17',
4638 'SM': '89.186.32.0/19',
4639 'SN': '41.82.0.0/15',
4640 'SO': '154.115.192.0/18',
4641 'SR': '186.179.128.0/17',
4642 'SS': '105.235.208.0/21',
4643 'ST': '197.159.160.0/19',
4644 'SV': '168.243.0.0/16',
4645 'SX': '190.102.0.0/20',
4646 'SY': '5.0.0.0/16',
4647 'SZ': '41.84.224.0/19',
4648 'TC': '65.255.48.0/20',
4649 'TD': '154.68.128.0/19',
4650 'TG': '196.168.0.0/14',
4651 'TH': '171.96.0.0/13',
4652 'TJ': '85.9.128.0/18',
4653 'TK': '27.96.24.0/21',
4654 'TL': '180.189.160.0/20',
4655 'TM': '95.85.96.0/19',
4656 'TN': '197.0.0.0/11',
4657 'TO': '175.176.144.0/21',
4658 'TR': '78.160.0.0/11',
4659 'TT': '186.44.0.0/15',
4660 'TV': '202.2.96.0/19',
4661 'TW': '120.96.0.0/11',
4662 'TZ': '156.156.0.0/14',
4663 'UA': '37.52.0.0/14',
4664 'UG': '102.80.0.0/13',
4665 'US': '6.0.0.0/8',
4666 'UY': '167.56.0.0/13',
4667 'UZ': '84.54.64.0/18',
4668 'VA': '212.77.0.0/19',
4669 'VC': '207.191.240.0/21',
4670 'VE': '186.88.0.0/13',
4671 'VG': '66.81.192.0/20',
4672 'VI': '146.226.0.0/16',
4673 'VN': '14.160.0.0/11',
4674 'VU': '202.80.32.0/20',
4675 'WF': '117.20.32.0/21',
4676 'WS': '202.4.32.0/19',
4677 'YE': '134.35.0.0/16',
4678 'YT': '41.242.116.0/22',
4679 'ZA': '41.0.0.0/11',
4680 'ZM': '102.144.0.0/13',
4681 'ZW': '102.177.192.0/18',
4682 }
4683
4684 @classmethod
4685 def random_ipv4(cls, code_or_block):
4686 if len(code_or_block) == 2:
4687 block = cls._country_ip_map.get(code_or_block.upper())
4688 if not block:
4689 return None
4690 else:
4691 block = code_or_block
4692 addr, preflen = block.split('/')
4693 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4694 addr_max = addr_min | (0xffffffff >> int(preflen))
4695 return str(socket.inet_ntoa(
4696 struct.pack('!L', random.randint(addr_min, addr_max))))
4697
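# Illustrative example (editor's addition): the address is drawn uniformly
# from the country's block, so the exact result varies per call.
#     >>> GeoUtils.random_ipv4('DE')  # doctest: +SKIP
#     '53.187.241.7'  # some address inside 53.0.0.0/8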
4698
4699 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4700 def __init__(self, proxies=None):
4701 # Set default handlers
4702 for type in ('http', 'https'):
4703 setattr(self, '%s_open' % type,
4704 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4705 meth(r, proxy, type))
4706 urllib.request.ProxyHandler.__init__(self, proxies)
4707
4708 def proxy_open(self, req, proxy, type):
4709 req_proxy = req.headers.get('Ytdl-request-proxy')
4710 if req_proxy is not None:
4711 proxy = req_proxy
4712 del req.headers['Ytdl-request-proxy']
4713
4714 if proxy == '__noproxy__':
4715 return None # No Proxy
4716 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4717 req.add_header('Ytdl-socks-proxy', proxy)
4718 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4719 return None
4720 return urllib.request.ProxyHandler.proxy_open(
4721 self, req, proxy, type)
4722
4723
4724 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4725 # released into the public domain
4726 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4727
4728 def long_to_bytes(n, blocksize=0):
4729 """long_to_bytes(n:long, blocksize:int) : string
4730 Convert a long integer to a byte string.
4731
4732 If optional blocksize is given and greater than zero, pad the front of the
4733 byte string with binary zeros so that the length is a multiple of
4734 blocksize.
4735 """
4736 # after much testing, this algorithm was deemed to be the fastest
4737 s = b''
4738 n = int(n)
4739 while n > 0:
4740 s = struct.pack('>I', n & 0xffffffff) + s
4741 n = n >> 32
4742 # strip off leading zeros
4743 for i in range(len(s)):
4744 if s[i] != b'\000'[0]:
4745 break
4746 else:
4747 # only happens when n == 0
4748 s = b'\000'
4749 i = 0
4750 s = s[i:]
4751 # add back some pad bytes. this could be done more efficiently w.r.t. the
4752 # de-padding being done above, but sigh...
4753 if blocksize > 0 and len(s) % blocksize:
4754 s = (blocksize - len(s) % blocksize) * b'\000' + s
4755 return s
4756
4757
4758 def bytes_to_long(s):
4759 """bytes_to_long(string) : long
4760 Convert a byte string to a long integer.
4761
4762 This is (essentially) the inverse of long_to_bytes().
4763 """
4764 acc = 0
4765 length = len(s)
4766 if length % 4:
4767 extra = (4 - length % 4)
4768 s = b'\000' * extra + s
4769 length = length + extra
4770 for i in range(0, length, 4):
4771 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4772 return acc
4773
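# Illustrative examples (editor's addition): the two helpers are mutual
# inverses for big-endian byte strings.
#     >>> bytes_to_long(b'\x01\x00')
#     256
#     >>> long_to_bytes(256)
#     b'\x01\x00'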
4774
4775 def ohdave_rsa_encrypt(data, exponent, modulus):
4776 '''
4777 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4778
4779 Input:
4780 data: data to encrypt, bytes-like object
4781 exponent, modulus: parameter e and N of RSA algorithm, both integer
4782 Output: hex string of encrypted data
4783
4784 Limitation: supports one block encryption only
4785 '''
4786
4787 payload = int(binascii.hexlify(data[::-1]), 16)
4788 encrypted = pow(payload, exponent, modulus)
4789 return '%x' % encrypted
4790
4791
4792 def pkcs1pad(data, length):
4793 """
4794 Padding input data with PKCS#1 scheme
4795
4796 @param {int[]} data input data
4797 @param {int} length target length
4798 @returns {int[]} padded data
4799 """
4800 if len(data) > length - 11:
4801 raise ValueError('Input data too long for PKCS#1 padding')
4802
4803 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4804 return [0, 2] + pseudo_random + [0] + data
4805
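# Illustrative example (editor's addition): the result always has the target
# length -- [0, 2], pseudo-random filler, a 0 separator, then the data.
#     >>> padded = pkcs1pad([0x41, 0x42], 16)
#     >>> len(padded), padded[:2], padded[-3:]
#     (16, [0, 2], [0, 65, 66])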
4806
4807 def _base_n_table(n, table):
4808 if not table and not n:
4809 raise ValueError('Either table or n must be specified')
4810 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4811
4812 if n and n != len(table):
4813 raise ValueError(f'base {n} exceeds table length {len(table)}')
4814 return table
4815
4816
4817 def encode_base_n(num, n=None, table=None):
4818 """Convert given int to a base-n string"""
4819 table = _base_n_table(n, table)
4820 if not num:
4821 return table[0]
4822
4823 result, base = '', len(table)
4824 while num:
4825 result = table[num % base] + result
4826 num = num // base
4827 return result
4828
4829
4830 def decode_base_n(string, n=None, table=None):
4831 """Convert given base-n string to int"""
4832 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4833 result, base = 0, len(table)
4834 for char in string:
4835 result = result * base + table[char]
4836 return result
4837
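# Illustrative examples (editor's addition): encode_base_n and decode_base_n
# round-trip through the (sliced) default base-62 table.
#     >>> encode_base_n(255, 16)
#     'ff'
#     >>> decode_base_n('ff', 16)
#     255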
4838
4839 def decode_base(value, digits):
4840 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4841 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4842 return decode_base_n(value, table=digits)
4843
4844
4845 def decode_packed_codes(code):
4846 mobj = re.search(PACKED_CODES_RE, code)
4847 obfuscated_code, base, count, symbols = mobj.groups()
4848 base = int(base)
4849 count = int(count)
4850 symbols = symbols.split('|')
4851 symbol_table = {}
4852
4853 while count:
4854 count -= 1
4855 base_n_count = encode_base_n(count, base)
4856 symbol_table[base_n_count] = symbols[count] or base_n_count
4857
4858 return re.sub(
4859 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4860 obfuscated_code)
4861
4862
4863 def caesar(s, alphabet, shift):
4864 if shift == 0:
4865 return s
4866 l = len(alphabet)
4867 return ''.join(
4868 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4869 for c in s)
4870
4871
4872 def rot47(s):
4873 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4874
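# Illustrative examples (editor's addition): rot47 shifts within the 94
# printable ASCII characters and is its own inverse.
#     >>> rot47('Hello')
#     'w6==@'
#     >>> rot47(rot47('Hello'))
#     'Hello'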
4875
4876 def parse_m3u8_attributes(attrib):
4877 info = {}
4878 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4879 if val.startswith('"'):
4880 val = val[1:-1]
4881 info[key] = val
4882 return info
4883
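# Illustrative example (editor's addition): quoted attribute values may
# contain commas; the surrounding quotes are stripped.
#     >>> parse_m3u8_attributes('BANDWIDTH=800000,CODECS="avc1.4d401e,mp4a.40.2"')
#     {'BANDWIDTH': '800000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}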
4884
4885 def urshift(val, n):
4886 return val >> n if val >= 0 else (val + 0x100000000) >> n
4887
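# Illustrative example (editor's addition): urshift emulates JavaScript's
# unsigned right shift (>>>) on 32-bit values.
#     >>> urshift(-1, 4)
#     268435455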
4888
4889 # Based on png2str() written by @gdkchan and improved by @yokrysty
4890 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4891 def decode_png(png_data):
4892 # Reference: https://www.w3.org/TR/PNG/
4893 header = png_data[8:]
4894
4895 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4896 raise OSError('Not a valid PNG file.')
4897
4898 int_map = {1: '>B', 2: '>H', 4: '>I'}
4899 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4900
4901 chunks = []
4902
4903 while header:
4904 length = unpack_integer(header[:4])
4905 header = header[4:]
4906
4907 chunk_type = header[:4]
4908 header = header[4:]
4909
4910 chunk_data = header[:length]
4911 header = header[length:]
4912
4913 header = header[4:] # Skip CRC
4914
4915 chunks.append({
4916 'type': chunk_type,
4917 'length': length,
4918 'data': chunk_data
4919 })
4920
4921 ihdr = chunks[0]['data']
4922
4923 width = unpack_integer(ihdr[:4])
4924 height = unpack_integer(ihdr[4:8])
4925
4926 idat = b''
4927
4928 for chunk in chunks:
4929 if chunk['type'] == b'IDAT':
4930 idat += chunk['data']
4931
4932 if not idat:
4933 raise OSError('Unable to read PNG data.')
4934
4935 decompressed_data = bytearray(zlib.decompress(idat))
4936
4937 stride = width * 3
4938 pixels = []
4939
4940 def _get_pixel(idx):
4941 x = idx % stride
4942 y = idx // stride
4943 return pixels[y][x]
4944
4945 for y in range(height):
4946 basePos = y * (1 + stride)
4947 filter_type = decompressed_data[basePos]
4948
4949 current_row = []
4950
4951 pixels.append(current_row)
4952
4953 for x in range(stride):
4954 color = decompressed_data[1 + basePos + x]
4955 basex = y * stride + x
4956 left = 0
4957 up = 0
4958
4959 if x > 2:
4960 left = _get_pixel(basex - 3)
4961 if y > 0:
4962 up = _get_pixel(basex - stride)
4963
4964 if filter_type == 1: # Sub
4965 color = (color + left) & 0xff
4966 elif filter_type == 2: # Up
4967 color = (color + up) & 0xff
4968 elif filter_type == 3: # Average
4969 color = (color + ((left + up) >> 1)) & 0xff
4970 elif filter_type == 4: # Paeth
4971 a = left
4972 b = up
4973 c = 0
4974
4975 if x > 2 and y > 0:
4976 c = _get_pixel(basex - stride - 3)
4977
4978 p = a + b - c
4979
4980 pa = abs(p - a)
4981 pb = abs(p - b)
4982 pc = abs(p - c)
4983
4984 if pa <= pb and pa <= pc:
4985 color = (color + a) & 0xff
4986 elif pb <= pc:
4987 color = (color + b) & 0xff
4988 else:
4989 color = (color + c) & 0xff
4990
4991 current_row.append(color)
4992
4993 return width, height, pixels
4994
4995
4996 def write_xattr(path, key, value):
4997 # Windows: Write xattrs to NTFS Alternate Data Streams:
4998 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4999 if compat_os_name == 'nt':
5000 assert ':' not in key
5001 assert os.path.exists(path)
5002
5003 try:
5004 with open(f'{path}:{key}', 'wb') as f:
5005 f.write(value)
5006 except OSError as e:
5007 raise XAttrMetadataError(e.errno, e.strerror)
5008 return
5009
5010 # UNIX Method 1. Use xattrs/pyxattrs modules
5011
5012 setxattr = None
5013 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5014 # Unicode arguments are not supported in pyxattr until version 0.5.0
5015 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5016 if version_tuple(xattr.__version__) >= (0, 5, 0):
5017 setxattr = xattr.set
5018 elif xattr:
5019 setxattr = xattr.setxattr
5020
5021 if setxattr:
5022 try:
5023 setxattr(path, key, value)
5024 except OSError as e:
5025 raise XAttrMetadataError(e.errno, e.strerror)
5026 return
5027
5028 # UNIX Method 2. Use setfattr/xattr executables
5029 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5030 else 'xattr' if check_executable('xattr', ['-h']) else None)
5031 if not exe:
5032 raise XAttrUnavailableError(
5033 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5034 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5035
5036 value = value.decode()
5037 try:
5038 _, stderr, returncode = Popen.run(
5039 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5040 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5041 except OSError as e:
5042 raise XAttrMetadataError(e.errno, e.strerror)
5043 if returncode:
5044 raise XAttrMetadataError(returncode, stderr)
5045
5046
5047 def random_birthday(year_field, month_field, day_field):
5048 start_date = datetime.date(1950, 1, 1)
5049 end_date = datetime.date(1995, 12, 31)
5050 offset = random.randint(0, (end_date - start_date).days)
5051 random_date = start_date + datetime.timedelta(offset)
5052 return {
5053 year_field: str(random_date.year),
5054 month_field: str(random_date.month),
5055 day_field: str(random_date.day),
5056 }
5057
5058
5059 # Templates for internet shortcut files, which are plain text files.
5060 DOT_URL_LINK_TEMPLATE = '''\
5061 [InternetShortcut]
5062 URL=%(url)s
5063 '''
5064
5065 DOT_WEBLOC_LINK_TEMPLATE = '''\
5066 <?xml version="1.0" encoding="UTF-8"?>
5067 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5068 <plist version="1.0">
5069 <dict>
5070 \t<key>URL</key>
5071 \t<string>%(url)s</string>
5072 </dict>
5073 </plist>
5074 '''
5075
5076 DOT_DESKTOP_LINK_TEMPLATE = '''\
5077 [Desktop Entry]
5078 Encoding=UTF-8
5079 Name=%(filename)s
5080 Type=Link
5081 URL=%(url)s
5082 Icon=text-html
5083 '''
5084
5085 LINK_TEMPLATES = {
5086 'url': DOT_URL_LINK_TEMPLATE,
5087 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5088 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5089 }
5090
5091
5092 def iri_to_uri(iri):
5093 """
5094 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5095
5096 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes (using an underlying UTF-8 encoding) any characters *other than* those already escaped, leaving the URI intact.
5097 """
5098
5099 iri_parts = urllib.parse.urlparse(iri)
5100
5101 if '[' in iri_parts.netloc:
5102 raise ValueError('IPv6 URIs are not yet supported.')
5103 # Querying `.netloc` also raises a ValueError when there is only one bracket.
5104
5105 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5106
5107 net_location = ''
5108 if iri_parts.username:
5109 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5110 if iri_parts.password is not None:
5111 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5112 net_location += '@'
5113
5114 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5115 # The 'idna' encoding produces ASCII text.
5116 if iri_parts.port is not None and iri_parts.port != 80:
5117 net_location += ':' + str(iri_parts.port)
5118
5119 return urllib.parse.urlunparse(
5120 (iri_parts.scheme,
5121 net_location,
5122
5123 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5124
5125 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5126 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5127
5128 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5129 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5130
5131 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5132
5133 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5134
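# Illustrative example (editor's addition): non-ASCII characters are
# percent-encoded as UTF-8 while the URI structure is preserved.
#     >>> iri_to_uri('https://example.com/föö?q=ä')
#     'https://example.com/f%C3%B6%C3%B6?q=%C3%A4'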
5135
5136 def to_high_limit_path(path):
5137 if sys.platform in ['win32', 'cygwin']:
5138 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5139 return '\\\\?\\' + os.path.abspath(path)
5140
5141 return path
5142
5143
5144 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5145 val = traverse_obj(obj, *variadic(field))
5146 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5147 return default
5148 return template % func(val)
5149
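# Illustrative examples (editor's addition): falsy values other than 0 yield
# `default` instead of being formatted.
#     >>> format_field({'width': 1920}, 'width', '%dpx')
#     '1920px'
#     >>> format_field({}, 'width', '%dpx', default='unknown')
#     'unknown'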
5150
5151 def clean_podcast_url(url):
5152 return re.sub(r'''(?x)
5153 (?:
5154 (?:
5155 chtbl\.com/track|
5156 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5157 play\.podtrac\.com
5158 )/[^/]+|
5159 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5160 flex\.acast\.com|
5161 pd(?:
5162 cn\.co| # https://podcorn.com/analytics-prefix/
5163 st\.fm # https://podsights.com/docs/
5164 )/e
5165 )/''', '', url)
5166
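# Illustrative example (editor's addition): a known analytics prefix is
# stripped from the URL; the target host below is hypothetical.
#     >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep.mp3')
#     'https://traffic.example.com/ep.mp3'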
5167
5168 _HEX_TABLE = '0123456789abcdef'
5169
5170
5171 def random_uuidv4():
5172 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5173
5174
5175 def make_dir(path, to_screen=None):
5176 try:
5177 dn = os.path.dirname(path)
5178 if dn and not os.path.exists(dn):
5179 os.makedirs(dn)
5180 return True
5181 except OSError as err:
5182 if callable(to_screen):
5183 to_screen('unable to create directory ' + error_to_compat_str(err))
5184 return False
5185
5186
5187 def get_executable_path():
5188 from .update import _get_variant_and_executable_path
5189
5190 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5191
5192
5193 def load_plugins(name, suffix, namespace):
5194 classes = {}
5195 with contextlib.suppress(FileNotFoundError):
5196 plugins_spec = importlib.util.spec_from_file_location(
5197 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5198 plugins = importlib.util.module_from_spec(plugins_spec)
5199 sys.modules[plugins_spec.name] = plugins
5200 plugins_spec.loader.exec_module(plugins)
5201 for plugin_name in dir(plugins):
5202 if plugin_name in namespace:
5203 continue
5204 if not plugin_name.endswith(suffix):
5205 continue
5206 klass = getattr(plugins, plugin_name)
5207 classes[plugin_name] = namespace[plugin_name] = klass
5208 return classes
5209
5210
5211 def traverse_obj(
5212 obj, *path_list, default=None, expected_type=None, get_all=True,
5213 casesense=True, is_user_input=False, traverse_string=False):
5214 ''' Traverse nested list/dict/tuple
5215 @param path_list A list of paths which are checked one by one.
5216 Each path is a list of keys where each key is a:
5217 - None: Do nothing
5218 - string: A dictionary key
5219 - int: An index into a list
5220 - tuple: A list of keys all of which will be traversed
5221 - Ellipsis: Fetch all values in the object
5222 - Function: Takes the key and value as arguments
5223 and returns whether the key matches or not
5224 @param default Default value to return
5225 @param expected_type Only accept final value of this type (Can also be any callable)
5226 @param get_all Return all the values obtained from a path or only the first one
5227 @param casesense Whether to consider dictionary keys as case sensitive
5228 @param is_user_input Whether the keys are generated from user input. If True,
5229 strings are converted to int/slice if necessary
5230 @param traverse_string Whether to traverse inside strings. If True, any
5231 non-compatible object will also be converted into a string
5232 # TODO: Write tests
5233 '''
5234 if not casesense:
5235 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5236 path_list = (map(_lower, variadic(path)) for path in path_list)
5237
5238 def _traverse_obj(obj, path, _current_depth=0):
5239 nonlocal depth
5240 path = tuple(variadic(path))
5241 for i, key in enumerate(path):
5242 if None in (key, obj):
5243 return obj
5244 if isinstance(key, (list, tuple)):
5245 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5246 key = ...
5247 if key is ...:
5248 obj = (obj.values() if isinstance(obj, dict)
5249 else obj if isinstance(obj, (list, tuple, LazyList))
5250 else str(obj) if traverse_string else [])
5251 _current_depth += 1
5252 depth = max(depth, _current_depth)
5253 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5254 elif callable(key):
5255 if isinstance(obj, (list, tuple, LazyList)):
5256 obj = enumerate(obj)
5257 elif isinstance(obj, dict):
5258 obj = obj.items()
5259 else:
5260 if not traverse_string:
5261 return None
5262 obj = str(obj)
5263 _current_depth += 1
5264 depth = max(depth, _current_depth)
5265 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5266 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5267 obj = (obj.get(key) if casesense or (key in obj)
5268 else next((v for k, v in obj.items() if _lower(k) == key), None))
5269 else:
5270 if is_user_input:
5271 key = (int_or_none(key) if ':' not in key
5272 else slice(*map(int_or_none, key.split(':'))))
5273 if key == slice(None):
5274 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5275 if not isinstance(key, (int, slice)):
5276 return None
5277 if not isinstance(obj, (list, tuple, LazyList)):
5278 if not traverse_string:
5279 return None
5280 obj = str(obj)
5281 try:
5282 obj = obj[key]
5283 except IndexError:
5284 return None
5285 return obj
5286
5287 if isinstance(expected_type, type):
5288 type_test = lambda val: val if isinstance(val, expected_type) else None
5289 else:
5290 type_test = expected_type or IDENTITY
5291
5292 for path in path_list:
5293 depth = 0
5294 val = _traverse_obj(obj, path)
5295 if val is not None:
5296 if depth:
5297 for _ in range(depth - 1):
5298 val = itertools.chain.from_iterable(v for v in val if v is not None)
5299 val = [v for v in map(type_test, val) if v is not None]
5300 if val:
5301 return val if get_all else val[0]
5302 else:
5303 val = type_test(val)
5304 if val is not None:
5305 return val
5306 return default
5307
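# Illustrative examples (editor's addition): `...` fans out over every value
# at that level; plain string keys index into dicts step by step.
#     >>> traverse_obj({'a': {'b': 3}}, ('a', 'b'))
#     3
#     >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))
#     [1, 2]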
5308
5309 def traverse_dict(dictn, keys, casesense=True):
5310 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5311 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5312 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5313
5314
5315 def get_first(obj, keys, **kwargs):
5316 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5317
5318
5319 def variadic(x, allowed_types=(str, bytes, dict)):
5320 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5321
5322
5323 def time_seconds(**kwargs):
5324 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5325 return t.timestamp()
5326
5327
5328 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5329 # the result is in JWS Compact Serialization format.
5330 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5331 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
5332 def jwt_encode_hs256(payload_data, key, headers={}):
5333 header_data = {
5334 'alg': 'HS256',
5335 'typ': 'JWT',
5336 }
5337 if headers:
5338 header_data.update(headers)
5339 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5340 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5341 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5342 signature_b64 = base64.b64encode(h.digest())
5343 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5344 return token
5345
5346
5347 # Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5348 def jwt_decode_hs256(jwt):
5349 header_b64, payload_b64, signature_b64 = jwt.split('.')
5350 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5351 return payload_data
5352
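# Illustrative example (editor's addition): encode/decode round trip. Note
# that jwt_decode_hs256 does NOT verify the signature.
#     >>> jwt_decode_hs256(jwt_encode_hs256({'uid': 1}, 'secret').decode())
#     {'uid': 1}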
5353
5354 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5355
5356
5357 @functools.cache
5358 def supports_terminal_sequences(stream):
5359 if compat_os_name == 'nt':
5360 if not WINDOWS_VT_MODE:
5361 return False
5362 elif not os.getenv('TERM'):
5363 return False
5364 try:
5365 return stream.isatty()
5366 except BaseException:
5367 return False
5368
5369
5370 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5371 if get_windows_version() < (10, 0, 10586):
5372 return
5373 global WINDOWS_VT_MODE
5374 try:
5375 Popen.run('', shell=True)
5376 except Exception:
5377 return
5378
5379 WINDOWS_VT_MODE = True
5380 supports_terminal_sequences.cache_clear()
5381
5382
5383 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5384
5385
5386 def remove_terminal_sequences(string):
5387 return _terminal_sequences_re.sub('', string)
5388
5389
5390 def number_of_digits(number):
5391 return len('%d' % number)
5392
5393
5394 def join_nonempty(*values, delim='-', from_dict=None):
5395 if from_dict is not None:
5396 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5397 return delim.join(map(str, filter(None, values)))
5398
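# Illustrative examples (editor's addition): falsy values (including 0) are
# dropped before joining.
#     >>> join_nonempty('a', None, '', 'b')
#     'a-b'
#     >>> join_nonempty(1, 0, 2, delim='.')
#     '1.2'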
5399
5400 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5401 """
5402 Find the largest format dimensions in terms of video width and, for each thumbnail:
5403 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5404 * Update the thumbnail dimensions accordingly
5405
5406 This function is useful with video services that scale the provided thumbnails on demand
5407 """
5408 _keys = ('width', 'height')
5409 max_dimensions = max(
5410 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5411 default=(0, 0))
5412 if not max_dimensions[0]:
5413 return thumbnails
5414 return [
5415 merge_dicts(
5416 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5417 dict(zip(_keys, max_dimensions)), thumbnail)
5418 for thumbnail in thumbnails
5419 ]
5420
5421
5422 def parse_http_range(range):
5423 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5424 if not range:
5425 return None, None, None
5426 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5427 if not crg:
5428 return None, None, None
5429 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5430
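# Illustrative examples (editor's addition): open-ended ranges yield None for
# the missing parts.
#     >>> parse_http_range('bytes 0-499/1234')
#     (0, 499, 1234)
#     >>> parse_http_range('bytes=500-')
#     (500, None, None)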
5431
5432 def read_stdin(what):
5433 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5434 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5435 return sys.stdin
5436
5437
5438 def determine_file_encoding(data):
5439 """
5440 Detect the text encoding used
5441 @returns (encoding, bytes to skip)
5442 """
5443
5444 # BOMs are given priority over coding declarations
5445 for bom, enc in BOMS:
5446 if data.startswith(bom):
5447 return enc, len(bom)
5448
5449 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5450 # We ignore the endianness to get a good enough match
5451 data = data.replace(b'\0', b'')
5452 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5453 return mobj.group(1).decode() if mobj else None, 0
5454
5455
5456 class Config:
5457 own_args = None
5458 parsed_args = None
5459 filename = None
5460 __initialized = False
5461
5462 def __init__(self, parser, label=None):
5463 self.parser, self.label = parser, label
5464 self._loaded_paths, self.configs = set(), []
5465
5466 def init(self, args=None, filename=None):
5467 assert not self.__initialized
5468 self.own_args, self.filename = args, filename
5469 return self.load_configs()
5470
5471 def load_configs(self):
5472 directory = ''
5473 if self.filename:
5474 location = os.path.realpath(self.filename)
5475 directory = os.path.dirname(location)
5476 if location in self._loaded_paths:
5477 return False
5478 self._loaded_paths.add(location)
5479
5480 self.__initialized = True
5481 opts, _ = self.parser.parse_known_args(self.own_args)
5482 self.parsed_args = self.own_args
5483 for location in opts.config_locations or []:
5484 if location == '-':
5485 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5486 continue
5487 location = os.path.join(directory, expand_path(location))
5488 if os.path.isdir(location):
5489 location = os.path.join(location, 'yt-dlp.conf')
5490 if not os.path.exists(location):
5491 self.parser.error(f'config location {location} does not exist')
5492 self.append_config(self.read_file(location), location)
5493 return True
5494
5495 def __str__(self):
5496 label = join_nonempty(
5497 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5498 delim=' ')
5499 return join_nonempty(
5500 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5501 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5502 delim='\n')
5503
5504 @staticmethod
5505 def read_file(filename, default=[]):  # NB: the mutable default is only ever returned, never mutated here
5506 try:
5507 optionf = open(filename, 'rb')
5508 except OSError:
5509 return default # silently skip if file is not present
5510 try:
5511 enc, skip = determine_file_encoding(optionf.read(512))
5512 optionf.seek(skip, io.SEEK_SET)
5513 except OSError:
5514 enc = None # silently skip read errors
5515 try:
5516 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5517 contents = optionf.read().decode(enc or preferredencoding())
5518 res = shlex.split(contents, comments=True)
5519 except Exception as err:
5520 raise ValueError(f'Unable to parse "{filename}": {err}') from err
5521 finally:
5522 optionf.close()
5523 return res
5524
5525 @staticmethod
5526 def hide_login_info(opts):
5527 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5528 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5529
5530 def _scrub_eq(o):
5531 m = eqre.match(o)
5532 if m:
5533 return m.group('key') + '=PRIVATE'
5534 else:
5535 return o
5536
5537 opts = list(map(_scrub_eq, opts))
5538 for idx, opt in enumerate(opts):
5539 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5540 opts[idx + 1] = 'PRIVATE'
5541 return opts
5542
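# Example for hide_login_info (both the `--opt=value` and separate-argument forms are scrubbed):
#   Config.hide_login_info(['-u', 'name', '--password=secret'])
#   # -> ['-u', 'PRIVATE', '--password=PRIVATE']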
5543 def append_config(self, *args, label=None):
5544 config = type(self)(self.parser, label)
5545 config._loaded_paths = self._loaded_paths
5546 if config.init(*args):
5547 self.configs.append(config)
5548
5549 @property
5550 def all_args(self):
5551 for config in reversed(self.configs):
5552 yield from config.all_args
5553 yield from self.parsed_args or []
5554
5555 def parse_known_args(self, **kwargs):
5556 return self.parser.parse_known_args(self.all_args, **kwargs)
5557
5558 def parse_args(self):
5559 return self.parser.parse_args(self.all_args)
5560
5561
5562 class WebSocketsWrapper:
5563 """Wraps the websockets module for use in non-async scopes"""
5564 pool = None
5565
5566 def __init__(self, url, headers=None, connect=True):
5567 self.loop = asyncio.new_event_loop()
5568 # XXX: "loop" is deprecated
5569 self.conn = websockets.connect(
5570 url, extra_headers=headers, ping_interval=None,
5571 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5572 if connect:
5573 self.__enter__()
5574 atexit.register(self.__exit__, None, None, None)
5575
5576 def __enter__(self):
5577 if not self.pool:
5578 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5579 return self
5580
5581 def send(self, *args):
5582 self.run_with_loop(self.pool.send(*args), self.loop)
5583
5584 def recv(self, *args):
5585 return self.run_with_loop(self.pool.recv(*args), self.loop)
5586
5587 def __exit__(self, type, value, traceback):
5588 try:
5589 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5590 finally:
5591 self._cancel_all_tasks(self.loop)  # cancel pending tasks while the loop is still open
5592 self.loop.close()
5593
5594 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5595 # for contributors: if any new library that uses asyncio needs to be run in a non-async scope, move these functions out of this class
5596 @staticmethod
5597 def run_with_loop(main, loop):
5598 if not asyncio.iscoroutine(main):
5599 raise ValueError(f'a coroutine was expected, got {main!r}')
5600
5601 try:
5602 return loop.run_until_complete(main)
5603 finally:
5604 loop.run_until_complete(loop.shutdown_asyncgens())
5605 if hasattr(loop, 'shutdown_default_executor'):
5606 loop.run_until_complete(loop.shutdown_default_executor())
5607
5608 @staticmethod
5609 def _cancel_all_tasks(loop):
5610 to_cancel = asyncio.all_tasks(loop)
5611
5612 if not to_cancel:
5613 return
5614
5615 for task in to_cancel:
5616 task.cancel()
5617
5618 # XXX: the "loop" parameter of asyncio.gather is removed in Python 3.10+
5619 loop.run_until_complete(
5620 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5621
5622 for task in to_cancel:
5623 if task.cancelled():
5624 continue
5625 if task.exception() is not None:
5626 loop.call_exception_handler({
5627 'message': 'unhandled exception during asyncio.run() shutdown',
5628 'exception': task.exception(),
5629 'task': task,
5630 })
5631
5632
5633 def merge_headers(*dicts):
5634 """Merge dicts of HTTP headers case-insensitively, prioritizing later ones"""
5635 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5636
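# e.g. merge_headers({'accept': '*/*'}, {'Accept': 'text/html'})  # -> {'Accept': 'text/html'}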
5637
5638 def cached_method(f):
5639 """Cache a method"""
5640 signature = inspect.signature(f)
5641
5642 @functools.wraps(f)
5643 def wrapper(self, *args, **kwargs):
5644 bound_args = signature.bind(self, *args, **kwargs)
5645 bound_args.apply_defaults()
5646 key = tuple(bound_args.arguments.values())
5647
5648 if not hasattr(self, '__cached_method__cache'):
5649 self.__cached_method__cache = {}
5650 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5651 if key not in cache:
5652 cache[key] = f(self, *args, **kwargs)
5653 return cache[key]
5654 return wrapper
5655
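# Usage sketch (the class and method below are hypothetical):
#   class Client:
#       @cached_method
#       def fetch(self, url):
#           ...  # expensive work; runs once per distinct `url` on each instance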
5656
5657 class classproperty:
5658 """property access for class methods"""
5659
5660 def __init__(self, func):
5661 functools.update_wrapper(self, func)
5662 self.func = func
5663
5664 def __get__(self, _, cls):
5665 return self.func(cls)
5666
5667
5668 class Namespace(types.SimpleNamespace):
5669 """Immutable namespace"""
5670
5671 def __iter__(self):
5672 return iter(self.__dict__.values())
5673
5674 @property
5675 def items_(self):  # the trailing underscore lets a namespace still hold a field named 'items'
5676 return self.__dict__.items()
5677
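# e.g. ns = Namespace(a=1, b=2)
#      list(ns)         # -> [1, 2]
#      dict(ns.items_)  # -> {'a': 1, 'b': 2}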
5678
5679 MEDIA_EXTENSIONS = Namespace(
5680 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5681 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5682 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5683 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5684 thumbnails=('jpg', 'png', 'webp'),
5685 storyboards=('mhtml', ),
5686 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5687 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5688 )
5689 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5690 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5691
5692 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5693
5694
5695 class RetryManager:
5696 """Usage:
5697 for retry in RetryManager(...):
5698 try:
5699 ...
5700 except SomeException as err:
5701 retry.error = err
5702 continue
5703 """
5704 attempt, _error = 0, None
5705
5706 def __init__(self, _retries, _error_callback, **kwargs):
5707 self.retries = _retries or 0
5708 self.error_callback = functools.partial(_error_callback, **kwargs)
5709
5710 def _should_retry(self):
5711 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5712
5713 @property
5714 def error(self):
5715 if self._error is NO_DEFAULT:
5716 return None
5717 return self._error
5718
5719 @error.setter
5720 def error(self, value):
5721 self._error = value
5722
5723 def __iter__(self):
5724 while self._should_retry():
5725 self.error = NO_DEFAULT
5726 self.attempt += 1
5727 yield self
5728 if self.error:
5729 self.error_callback(self.error, self.attempt, self.retries)
5730
5731 @staticmethod
5732 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5733 """Utility function for reporting retries"""
5734 if count > retries:
5735 if error:
5736 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5737 raise e
5738
5739 if not count:
5740 return warn(e)
5741 elif isinstance(e, ExtractorError):
5742 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')  # e.cause may be an exception rather than a string
5743 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5744
5745 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5746 if delay:
5747 info(f'Sleeping {delay:.2f} seconds ...')
5748 time.sleep(delay)
5749
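# Wiring sketch (do_request and the callback arguments are illustrative);
# report_retry is designed to serve as the _error_callback:
#   for retry in RetryManager(3, RetryManager.report_retry,
#                             sleep_func=1, info=print, warn=print):
#       try:
#           do_request()
#       except OSError as err:
#           retry.error = err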
5750
5751 def make_archive_id(ie, video_id):
5752 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5753 return f'{ie_key.lower()} {video_id}'
5754
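# e.g. make_archive_id('Youtube', 'dQw4w9WgXcQ')  # -> 'youtube dQw4w9WgXcQ'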
5755
5756 # Deprecated
5757 has_certifi = bool(certifi)
5758 has_websockets = bool(websockets)