import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
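
# Usage sketch (illustrative, not from the original source): expands each
# `prefix:tag` step through the given namespace map, e.g.
#   xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/{http://example.com/ns}url'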


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
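
# Usage sketch (illustrative, not from the original source):
#   doc = compat_etree_fromstring('<root><a k="v">text</a></root>')
#   xpath_text(doc, 'a') == 'text' and xpath_attr(doc, 'a', 'k') == 'v'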


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
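
# Usage sketch (illustrative, not from the original source):
#   get_element_by_class('foo', '<div class="foo bar">baz</div>') == 'baz'
#   get_elements_by_attribute('id', 'x', '<p id="x">hi</p>') == ['hi']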


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
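
# Usage sketch (illustrative, not from the original source):
#   get_element_text_and_html_by_tag('span', '<div><span>a</span></div>')
#   == ('a', '<span>a</span>')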


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
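
# Usage sketch (illustrative, not from the original source): attribute names are
# lower-cased by html.parser and unquoted values are kept as strings:
#   extract_attributes('<a href="x" B=1>') == {'href': 'x', 'b': '1'}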


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
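
# Usage sketch (illustrative, not from the original source):
#   clean_html('a<br/>b') == 'a\nb'
#   clean_html('<p>x</p><p>y</p>') == 'x\ny'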


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
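
# Usage sketch (illustrative, not from the original source): json.loads() passes
# extra keyword arguments through to the decoder class, so trailing garbage after
# the JSON value can be ignored:
#   json.loads('{"a": 1}; trailer', cls=LenientJSONDecoder, ignore_extra=True) == {'a': 1}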


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream. Eg, when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
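
# Usage sketch (illustrative, not from the original source; values hand-checked
# against the logic above):
#   sanitize_filename('New World record at 0:12:34') == 'New World record at 0_12_34'
#   sanitize_filename('a"b/c', restricted=True) == 'ab_c'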


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
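
# Usage sketch (illustrative, not from the original source):
#   sanitize_url('//example.com/x') == 'http://example.com/x'
#   sanitize_url('httpss://example.com') == 'https://example.com'
#   sanitize_url('rmtp://example.com/live') == 'rtmp://example.com/live'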


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
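
# Usage sketch (illustrative, not from the original source): credentials are split
# out of the netloc and returned as a ready-made Authorization header value:
#   extract_basic_auth('http://user:pass@example.com/')
#   == ('http://example.com/', 'Basic dXNlcjpwYXNz')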


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
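
# Usage sketch (illustrative, not from the original source): order is preserved
# and later duplicates are dropped:
#   orderedSet([1, 2, 1, 3, 2]) == [1, 2, 3]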


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
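
# Usage sketch (illustrative, not from the original source):
#   unescapeHTML('&amp;') == '&'
#   unescapeHTML('&#x41;') == 'A'
#   unescapeHTML('&eacute;') == 'é'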


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
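
# Usage sketch (illustrative, not from the original source):
#   timetuple_from_msec(123456789) == Time(hours=34, minutes=17, seconds=36, milliseconds=789)
#   formatSeconds(3661) == '1:01:01'
#   formatSeconds(61, msec=True) == '1:01.000'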


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
1709
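# Illustrative examples of the timezone handling above (expected values worked
# out by hand, not executed):
#   parse_iso8601('1970-01-01T00:00:00Z') == 0
#   parse_iso8601('1970-01-01T01:00:00+01:00') == 0  # the +01:00 offset is subtracted back out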
1710
1711 def date_formats(day_first=True):
1712 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1713
1714
1715 def unified_strdate(date_str, day_first=True):
1716 """Return a string with the date in the format YYYYMMDD"""
1717
1718 if date_str is None:
1719 return None
1720 upload_date = None
1721 # Replace commas
1722 date_str = date_str.replace(',', ' ')
1723 # Remove AM/PM + timezone
1724 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1725 _, date_str = extract_timezone(date_str)
1726
1727 for expression in date_formats(day_first):
1728 with contextlib.suppress(ValueError):
1729 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1730 if upload_date is None:
1731 timetuple = email.utils.parsedate_tz(date_str)
1732 if timetuple:
1733 with contextlib.suppress(ValueError):
1734 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1735 if upload_date is not None:
1736 return str(upload_date)
1737
1738
1739 def unified_timestamp(date_str, day_first=True):
1740 if date_str is None:
1741 return None
1742
1743 date_str = re.sub(r'[,|]', '', date_str)
1744
1745 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1746 timezone, date_str = extract_timezone(date_str)
1747
1748 # Remove AM/PM + timezone
1749 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1750
1751 # Remove unrecognized timezones from ISO 8601 alike timestamps
1752 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1753 if m:
1754 date_str = date_str[:-len(m.group('tz'))]
1755
1756 # Python only supports microseconds, so remove nanoseconds
1757 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1758 if m:
1759 date_str = m.group(1)
1760
1761 for expression in date_formats(day_first):
1762 with contextlib.suppress(ValueError):
1763 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1764 return calendar.timegm(dt.timetuple())
1765 timetuple = email.utils.parsedate_tz(date_str)
1766 if timetuple:
1767 return calendar.timegm(timetuple) + pm_delta * 3600
1768
1769
1770 def determine_ext(url, default_ext='unknown_video'):
1771 if url is None or '.' not in url:
1772 return default_ext
1773 guess = url.partition('?')[0].rpartition('.')[2]
1774 if re.match(r'^[A-Za-z0-9]+$', guess):
1775 return guess
1776 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1777 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1778 return guess.rstrip('/')
1779 else:
1780 return default_ext
1781
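# Sketch of the expected behaviour (assuming 'mp4' is in the module-level
# KNOWN_EXTENSIONS table, defined elsewhere in this file):
#   determine_ext('http://example.com/video.mp4') == 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'
#   determine_ext('http://example.com/page') == 'unknown_video'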
1782
1783 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1784 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1785
1786
1787 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1788 R"""
1789 Return a datetime object from a string.
1790 Supported format:
1791 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1792
1793 @param format strftime format of DATE
1794 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1795 auto: round to the unit provided in date_str (if applicable).
1796 """
1797 auto_precision = False
1798 if precision == 'auto':
1799 auto_precision = True
1800 precision = 'microsecond'
1801 today = datetime_round(datetime.datetime.utcnow(), precision)
1802 if date_str in ('now', 'today'):
1803 return today
1804 if date_str == 'yesterday':
1805 return today - datetime.timedelta(days=1)
1806 match = re.match(
1807 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1808 date_str)
1809 if match is not None:
1810 start_time = datetime_from_str(match.group('start'), precision, format)
1811 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1812 unit = match.group('unit')
1813 if unit == 'month' or unit == 'year':
1814 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1815 unit = 'day'
1816 else:
1817 if unit == 'week':
1818 unit = 'day'
1819 time *= 7
1820 delta = datetime.timedelta(**{unit + 's': time})
1821 new_date = start_time + delta
1822 if auto_precision:
1823 return datetime_round(new_date, unit)
1824 return new_date
1825
1826 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1827
1828
1829 def date_from_str(date_str, format='%Y%m%d', strict=False):
1830 R"""
1831 Return a date object from a string using datetime_from_str
1832
1833 @param strict Restrict allowed patterns to "YYYYMMDD" and
1834 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1835 """
1836 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1837 raise ValueError(f'Invalid date format "{date_str}"')
1838 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1839
1840
1841 def datetime_add_months(dt, months):
1842 """Increment/Decrement a datetime object by months."""
1843 month = dt.month + months - 1
1844 year = dt.year + month // 12
1845 month = month % 12 + 1
1846 day = min(dt.day, calendar.monthrange(year, month)[1])
1847 return dt.replace(year, month, day)
1848
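# Example of the day-clamping above (values worked out by hand):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1) -> datetime(2020, 2, 29)
#   datetime_add_months(datetime.datetime(2020, 1, 15), -2) -> datetime(2019, 11, 15)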
1849
1850 def datetime_round(dt, precision='day'):
1851 """
1852 Round a datetime object's time to a specific precision
1853 """
1854 if precision == 'microsecond':
1855 return dt
1856
1857 unit_seconds = {
1858 'day': 86400,
1859 'hour': 3600,
1860 'minute': 60,
1861 'second': 1,
1862 }
1863 roundto = lambda x, n: ((x + n / 2) // n) * n
1864 timestamp = calendar.timegm(dt.timetuple())
1865 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1866
1867
1868 def hyphenate_date(date_str):
1869 """
1870 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1871 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1872 if match is not None:
1873 return '-'.join(match.groups())
1874 else:
1875 return date_str
1876
1877
1878 class DateRange:
1879 """Represents a time interval between two dates"""
1880
1881 def __init__(self, start=None, end=None):
1882 """start and end must be strings in the format accepted by date"""
1883 if start is not None:
1884 self.start = date_from_str(start, strict=True)
1885 else:
1886 self.start = datetime.datetime.min.date()
1887 if end is not None:
1888 self.end = date_from_str(end, strict=True)
1889 else:
1890 self.end = datetime.datetime.max.date()
1891 if self.start > self.end:
1892 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1893
1894 @classmethod
1895 def day(cls, day):
1896 """Returns a range that only contains the given day"""
1897 return cls(day, day)
1898
1899 def __contains__(self, date):
1900 """Check if the date is in the range"""
1901 if not isinstance(date, datetime.date):
1902 date = date_from_str(date)
1903 return self.start <= date <= self.end
1904
1905 def __str__(self):
1906 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1907
1908 def __eq__(self, other):
1909 return (isinstance(other, DateRange)
1910 and self.start == other.start and self.end == other.end)
1911
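# Sketch of intended usage (dates in any format accepted by date_from_str):
#   '20200615' in DateRange('20200101', '20200630')  # True
#   '20200701' in DateRange('20200101', '20200630')  # False
#   DateRange.day('20200101')  # range containing only 2020-01-01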
1912
1913 def platform_name():
1914 """ Returns the platform name as a str """
1915 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1916 return platform.platform()
1917
1918
1919 @functools.cache
1920 def system_identifier():
1921 python_implementation = platform.python_implementation()
1922 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1923 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1924
1925 return 'Python %s (%s %s) - %s %s' % (
1926 platform.python_version(),
1927 python_implementation,
1928 platform.architecture()[0],
1929 platform.platform(),
1930 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1931 )
1932
1933
1934 @functools.cache
1935 def get_windows_version():
1936 ''' Get Windows version. Returns () if not running on Windows '''
1937 if compat_os_name == 'nt':
1938 return version_tuple(platform.win32_ver()[1])
1939 else:
1940 return ()
1941
1942
1943 def write_string(s, out=None, encoding=None):
1944 assert isinstance(s, str)
1945 out = out or sys.stderr
1946
1947 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1948 s = re.sub(r'([\r\n]+)', r' \1', s)
1949
1950 enc, buffer = None, out
1951 if 'b' in getattr(out, 'mode', ''):
1952 enc = encoding or preferredencoding()
1953 elif hasattr(out, 'buffer'):
1954 buffer = out.buffer
1955 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1956
1957 buffer.write(s.encode(enc, 'ignore') if enc else s)
1958 out.flush()
1959
1960
1961 def bytes_to_intlist(bs):
1962 if not bs:
1963 return []
1964 if isinstance(bs[0], int): # bytes and bytearray yield ints when indexed
1965 return list(bs)
1966 else:
1967 return [ord(c) for c in bs]
1968
1969
1970 def intlist_to_bytes(xs):
1971 if not xs:
1972 return b''
1973 return struct.pack('%dB' % len(xs), *xs)
1974
1975
1976 class LockingUnsupportedError(OSError):
1977 msg = 'File locking is not supported'
1978
1979 def __init__(self):
1980 super().__init__(self.msg)
1981
1982
1983 # Cross-platform file locking
1984 if sys.platform == 'win32':
1985 import ctypes
1986 import ctypes.wintypes
1987 import msvcrt
1988
1989 class OVERLAPPED(ctypes.Structure):
1990 _fields_ = [
1991 ('Internal', ctypes.wintypes.LPVOID),
1992 ('InternalHigh', ctypes.wintypes.LPVOID),
1993 ('Offset', ctypes.wintypes.DWORD),
1994 ('OffsetHigh', ctypes.wintypes.DWORD),
1995 ('hEvent', ctypes.wintypes.HANDLE),
1996 ]
1997
1998 kernel32 = ctypes.windll.kernel32
1999 LockFileEx = kernel32.LockFileEx
2000 LockFileEx.argtypes = [
2001 ctypes.wintypes.HANDLE, # hFile
2002 ctypes.wintypes.DWORD, # dwFlags
2003 ctypes.wintypes.DWORD, # dwReserved
2004 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2005 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2006 ctypes.POINTER(OVERLAPPED) # Overlapped
2007 ]
2008 LockFileEx.restype = ctypes.wintypes.BOOL
2009 UnlockFileEx = kernel32.UnlockFileEx
2010 UnlockFileEx.argtypes = [
2011 ctypes.wintypes.HANDLE, # hFile
2012 ctypes.wintypes.DWORD, # dwReserved
2013 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2014 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2015 ctypes.POINTER(OVERLAPPED) # Overlapped
2016 ]
2017 UnlockFileEx.restype = ctypes.wintypes.BOOL
2018 whole_low = 0xffffffff
2019 whole_high = 0x7fffffff
2020
2021 def _lock_file(f, exclusive, block):
2022 overlapped = OVERLAPPED()
2023 overlapped.Offset = 0
2024 overlapped.OffsetHigh = 0
2025 overlapped.hEvent = 0
2026 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2027
2028 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2029 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2030 0, whole_low, whole_high, f._lock_file_overlapped_p):
2031 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2032 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2033
2034 def _unlock_file(f):
2035 assert f._lock_file_overlapped_p
2036 handle = msvcrt.get_osfhandle(f.fileno())
2037 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2038 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2039
2040 else:
2041 try:
2042 import fcntl
2043
2044 def _lock_file(f, exclusive, block):
2045 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2046 if not block:
2047 flags |= fcntl.LOCK_NB
2048 try:
2049 fcntl.flock(f, flags)
2050 except BlockingIOError:
2051 raise
2052 except OSError: # AOSP does not have flock()
2053 fcntl.lockf(f, flags)
2054
2055 def _unlock_file(f):
2056 try:
2057 fcntl.flock(f, fcntl.LOCK_UN)
2058 except OSError:
2059 fcntl.lockf(f, fcntl.LOCK_UN)
2060
2061 except ImportError:
2062
2063 def _lock_file(f, exclusive, block):
2064 raise LockingUnsupportedError()
2065
2066 def _unlock_file(f):
2067 raise LockingUnsupportedError()
2068
2069
2070 class locked_file:
2071 locked = False
2072
2073 def __init__(self, filename, mode, block=True, encoding=None):
2074 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2075 raise NotImplementedError(mode)
2076 self.mode, self.block = mode, block
2077
2078 writable = any(f in mode for f in 'wax+')
2079 readable = any(f in mode for f in 'r+')
2080 flags = functools.reduce(operator.ior, (
2081 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2082 getattr(os, 'O_BINARY', 0), # Windows only
2083 getattr(os, 'O_NOINHERIT', 0), # Windows only
2084 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2085 os.O_APPEND if 'a' in mode else 0,
2086 os.O_EXCL if 'x' in mode else 0,
2087 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2088 ))
2089
2090 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2091
2092 def __enter__(self):
2093 exclusive = 'r' not in self.mode
2094 try:
2095 _lock_file(self.f, exclusive, self.block)
2096 self.locked = True
2097 except OSError:
2098 self.f.close()
2099 raise
2100 if 'w' in self.mode:
2101 try:
2102 self.f.truncate()
2103 except OSError as e:
2104 if e.errno not in (
2105 errno.ESPIPE, # Illegal seek - expected for FIFO
2106 errno.EINVAL, # Invalid argument - expected for /dev/null
2107 ):
2108 raise
2109 return self
2110
2111 def unlock(self):
2112 if not self.locked:
2113 return
2114 try:
2115 _unlock_file(self.f)
2116 finally:
2117 self.locked = False
2118
2119 def __exit__(self, *_):
2120 try:
2121 self.unlock()
2122 finally:
2123 self.f.close()
2124
2125 open = __enter__
2126 close = __exit__
2127
2128 def __getattr__(self, attr):
2129 return getattr(self.f, attr)
2130
2131 def __iter__(self):
2132 return iter(self.f)
2133
2134
2135 @functools.cache
2136 def get_filesystem_encoding():
2137 encoding = sys.getfilesystemencoding()
2138 return encoding if encoding is not None else 'utf-8'
2139
2140
2141 def shell_quote(args):
2142 quoted_args = []
2143 encoding = get_filesystem_encoding()
2144 for a in args:
2145 if isinstance(a, bytes):
2146 # We may get a filename encoded with 'encodeFilename'
2147 a = a.decode(encoding)
2148 quoted_args.append(compat_shlex_quote(a))
2149 return ' '.join(quoted_args)
2150
2151
2152 def smuggle_url(url, data):
2153 """ Pass additional data in a URL for internal use. """
2154
2155 url, idata = unsmuggle_url(url, {})
2156 data.update(idata)
2157 sdata = urllib.parse.urlencode(
2158 {'__youtubedl_smuggle': json.dumps(data)})
2159 return url + '#' + sdata
2160
2161
2162 def unsmuggle_url(smug_url, default=None):
2163 if '#__youtubedl_smuggle' not in smug_url:
2164 return smug_url, default
2165 url, _, sdata = smug_url.rpartition('#')
2166 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2167 data = json.loads(jsond)
2168 return url, data
2169
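# The two functions above are inverses; a round trip with an illustrative
# payload should preserve the data:
#   url = smuggle_url('https://example.com/video', {'referrer': 'x'})
#   unsmuggle_url(url) == ('https://example.com/video', {'referrer': 'x'})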
2170
2171 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2172 """ Formats numbers with decimal sufixes like K, M, etc """
2173 num, factor = float_or_none(num), float(factor)
2174 if num is None or num < 0:
2175 return None
2176 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2177 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2178 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2179 if factor == 1024:
2180 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2181 converted = num / (factor ** exponent)
2182 return fmt % (converted, suffix)
2183
2184
2185 def format_bytes(bytes):
2186 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2187
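# Illustrative results (expected, not executed):
#   format_decimal_suffix(12345, '%.1f%s') == '12.3k'
#   format_bytes(2048) == '2.00KiB'  # factor=1024 switches to binary (Ki/Mi/...) suffixes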
2188
2189 def lookup_unit_table(unit_table, s):
2190 units_re = '|'.join(re.escape(u) for u in unit_table)
2191 m = re.match(
2192 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2193 if not m:
2194 return None
2195 num_str = m.group('num').replace(',', '.')
2196 mult = unit_table[m.group('unit')]
2197 return int(float(num_str) * mult)
2198
2199
2200 def parse_filesize(s):
2201 if s is None:
2202 return None
2203
2204 # The lower-case forms are of course incorrect and unofficial,
2205 # but we support those too
2206 _UNIT_TABLE = {
2207 'B': 1,
2208 'b': 1,
2209 'bytes': 1,
2210 'KiB': 1024,
2211 'KB': 1000,
2212 'kB': 1024,
2213 'Kb': 1000,
2214 'kb': 1000,
2215 'kilobytes': 1000,
2216 'kibibytes': 1024,
2217 'MiB': 1024 ** 2,
2218 'MB': 1000 ** 2,
2219 'mB': 1024 ** 2,
2220 'Mb': 1000 ** 2,
2221 'mb': 1000 ** 2,
2222 'megabytes': 1000 ** 2,
2223 'mebibytes': 1024 ** 2,
2224 'GiB': 1024 ** 3,
2225 'GB': 1000 ** 3,
2226 'gB': 1024 ** 3,
2227 'Gb': 1000 ** 3,
2228 'gb': 1000 ** 3,
2229 'gigabytes': 1000 ** 3,
2230 'gibibytes': 1024 ** 3,
2231 'TiB': 1024 ** 4,
2232 'TB': 1000 ** 4,
2233 'tB': 1024 ** 4,
2234 'Tb': 1000 ** 4,
2235 'tb': 1000 ** 4,
2236 'terabytes': 1000 ** 4,
2237 'tebibytes': 1024 ** 4,
2238 'PiB': 1024 ** 5,
2239 'PB': 1000 ** 5,
2240 'pB': 1024 ** 5,
2241 'Pb': 1000 ** 5,
2242 'pb': 1000 ** 5,
2243 'petabytes': 1000 ** 5,
2244 'pebibytes': 1024 ** 5,
2245 'EiB': 1024 ** 6,
2246 'EB': 1000 ** 6,
2247 'eB': 1024 ** 6,
2248 'Eb': 1000 ** 6,
2249 'eb': 1000 ** 6,
2250 'exabytes': 1000 ** 6,
2251 'exbibytes': 1024 ** 6,
2252 'ZiB': 1024 ** 7,
2253 'ZB': 1000 ** 7,
2254 'zB': 1024 ** 7,
2255 'Zb': 1000 ** 7,
2256 'zb': 1000 ** 7,
2257 'zettabytes': 1000 ** 7,
2258 'zebibytes': 1024 ** 7,
2259 'YiB': 1024 ** 8,
2260 'YB': 1000 ** 8,
2261 'yB': 1024 ** 8,
2262 'Yb': 1000 ** 8,
2263 'yb': 1000 ** 8,
2264 'yottabytes': 1000 ** 8,
2265 'yobibytes': 1024 ** 8,
2266 }
2267
2268 return lookup_unit_table(_UNIT_TABLE, s)
2269
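# Expected results for a couple of inputs (binary vs decimal units):
#   parse_filesize('1.5 GiB') == 1610612736  # 1.5 * 1024**3
#   parse_filesize('500 MB') == 500000000    # 500 * 1000**2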
2270
2271 def parse_count(s):
2272 if s is None:
2273 return None
2274
2275 s = re.sub(r'^[^\d]+\s', '', s).strip()
2276
2277 if re.match(r'^[\d,.]+$', s):
2278 return str_to_int(s)
2279
2280 _UNIT_TABLE = {
2281 'k': 1000,
2282 'K': 1000,
2283 'm': 1000 ** 2,
2284 'M': 1000 ** 2,
2285 'kk': 1000 ** 2,
2286 'KK': 1000 ** 2,
2287 'b': 1000 ** 3,
2288 'B': 1000 ** 3,
2289 }
2290
2291 ret = lookup_unit_table(_UNIT_TABLE, s)
2292 if ret is not None:
2293 return ret
2294
2295 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2296 if mobj:
2297 return str_to_int(mobj.group(1))
2298
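# Sketch of the expected behaviour:
#   parse_count('1.2M') == 1200000
#   parse_count('12,345 views') == 12345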
2299
2300 def parse_resolution(s, *, lenient=False):
2301 if s is None:
2302 return {}
2303
2304 if lenient:
2305 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2306 else:
2307 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2308 if mobj:
2309 return {
2310 'width': int(mobj.group('w')),
2311 'height': int(mobj.group('h')),
2312 }
2313
2314 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2315 if mobj:
2316 return {'height': int(mobj.group(1))}
2317
2318 mobj = re.search(r'\b([48])[kK]\b', s)
2319 if mobj:
2320 return {'height': int(mobj.group(1)) * 540}
2321
2322 return {}
2323
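# Expected results for typical inputs:
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p') == {'height': 720}
#   parse_resolution('4k') == {'height': 2160}  # the n * 540 heuristic above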
2324
2325 def parse_bitrate(s):
2326 if not isinstance(s, str):
2327 return
2328 mobj = re.search(r'\b(\d+)\s*kbps', s)
2329 if mobj:
2330 return int(mobj.group(1))
2331
2332
2333 def month_by_name(name, lang='en'):
2334 """ Return the number of a month by (locale-independently) English name """
2335
2336 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2337
2338 try:
2339 return month_names.index(name) + 1
2340 except ValueError:
2341 return None
2342
2343
2344 def month_by_abbreviation(abbrev):
2345 """ Return the number of a month by (locale-independently) English
2346 abbreviations """
2347
2348 try:
2349 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2350 except ValueError:
2351 return None
2352
2353
2354 def fix_xml_ampersands(xml_str):
2355 """Replace all the '&' by '&amp;' in XML"""
2356 return re.sub(
2357 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2358 '&amp;',
2359 xml_str)
2360
2361
2362 def setproctitle(title):
2363 assert isinstance(title, str)
2364
2365 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2366 try:
2367 import ctypes
2368 except ImportError:
2369 return
2370
2371 try:
2372 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2373 except OSError:
2374 return
2375 except TypeError:
2376 # LoadLibrary in Windows Python 2.7.13 only expects
2377 # a bytestring, but since unicode_literals turns
2378 # every string into a unicode string, it fails.
2379 return
2380 title_bytes = title.encode()
2381 buf = ctypes.create_string_buffer(len(title_bytes))
2382 buf.value = title_bytes
2383 try:
2384 libc.prctl(15, buf, 0, 0, 0)
2385 except AttributeError:
2386 return # Strange libc, just skip this
2387
2388
2389 def remove_start(s, start):
2390 return s[len(start):] if s is not None and s.startswith(start) else s
2391
2392
2393 def remove_end(s, end):
2394 return s[:-len(end)] if s is not None and s.endswith(end) else s
2395
2396
2397 def remove_quotes(s):
2398 if s is None or len(s) < 2:
2399 return s
2400 for quote in ('"', "'", ):
2401 if s[0] == quote and s[-1] == quote:
2402 return s[1:-1]
2403 return s
2404
2405
2406 def get_domain(url):
2407 """
2408 This implementation is inconsistent, but is kept for compatibility.
2409 Use this only for "webpage_url_domain"
2410 """
2411 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2412
2413
2414 def url_basename(url):
2415 path = urllib.parse.urlparse(url).path
2416 return path.strip('/').split('/')[-1]
2417
2418
2419 def base_url(url):
2420 return re.match(r'https?://[^?#&]+/', url).group()
2421
2422
2423 def urljoin(base, path):
2424 if isinstance(path, bytes):
2425 path = path.decode()
2426 if not isinstance(path, str) or not path:
2427 return None
2428 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2429 return path
2430 if isinstance(base, bytes):
2431 base = base.decode()
2432 if not isinstance(base, str) or not re.match(
2433 r'^(?:https?:)?//', base):
2434 return None
2435 return urllib.parse.urljoin(base, path)
2436
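# Sketch of the expected behaviour:
#   urljoin('https://example.com/a/', 'b/c.mp4') == 'https://example.com/a/b/c.mp4'
#   urljoin('https://example.com/a/', '//cdn.example.com/v.mp4') == '//cdn.example.com/v.mp4'
#   urljoin(None, 'b/c.mp4') is None  # invalid bases are rejected rather than raising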
2437
2438 class HEADRequest(urllib.request.Request):
2439 def get_method(self):
2440 return 'HEAD'
2441
2442
2443 class PUTRequest(urllib.request.Request):
2444 def get_method(self):
2445 return 'PUT'
2446
2447
2448 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2449 if get_attr and v is not None:
2450 v = getattr(v, get_attr, None)
2451 try:
2452 return int(v) * invscale // scale
2453 except (ValueError, TypeError, OverflowError):
2454 return default
2455
2456
2457 def str_or_none(v, default=None):
2458 return default if v is None else str(v)
2459
2460
2461 def str_to_int(int_str):
2462 """ A more relaxed version of int_or_none """
2463 if isinstance(int_str, int):
2464 return int_str
2465 elif isinstance(int_str, str):
2466 int_str = re.sub(r'[,\.\+]', '', int_str)
2467 return int_or_none(int_str)
2468
2469
2470 def float_or_none(v, scale=1, invscale=1, default=None):
2471 if v is None:
2472 return default
2473 try:
2474 return float(v) * invscale / scale
2475 except (ValueError, TypeError):
2476 return default
2477
2478
2479 def bool_or_none(v, default=None):
2480 return v if isinstance(v, bool) else default
2481
2482
2483 def strip_or_none(v, default=None):
2484 return v.strip() if isinstance(v, str) else default
2485
2486
2487 def url_or_none(url):
2488 if not url or not isinstance(url, str):
2489 return None
2490 url = url.strip()
2491 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2492
2493
2494 def request_to_url(req):
2495 if isinstance(req, urllib.request.Request):
2496 return req.get_full_url()
2497 else:
2498 return req
2499
2500
2501 def strftime_or_none(timestamp, date_format, default=None):
2502 datetime_object = None
2503 try:
2504 if isinstance(timestamp, (int, float)): # unix timestamp
2505 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2506 elif isinstance(timestamp, str): # assume YYYYMMDD
2507 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2508 return datetime_object.strftime(date_format)
2509 except (ValueError, TypeError, AttributeError):
2510 return default
2511
2512
2513 def parse_duration(s):
2514 if not isinstance(s, str):
2515 return None
2516 s = s.strip()
2517 if not s:
2518 return None
2519
2520 days, hours, mins, secs, ms = [None] * 5
2521 m = re.match(r'''(?x)
2522 (?P<before_secs>
2523 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2524 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2525 (?P<ms>[.:][0-9]+)?Z?$
2526 ''', s)
2527 if m:
2528 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2529 else:
2530 m = re.match(
2531 r'''(?ix)(?:P?
2532 (?:
2533 [0-9]+\s*y(?:ears?)?,?\s*
2534 )?
2535 (?:
2536 [0-9]+\s*m(?:onths?)?,?\s*
2537 )?
2538 (?:
2539 [0-9]+\s*w(?:eeks?)?,?\s*
2540 )?
2541 (?:
2542 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2543 )?
2544 T)?
2545 (?:
2546 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2547 )?
2548 (?:
2549 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2550 )?
2551 (?:
2552 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2553 )?Z?$''', s)
2554 if m:
2555 days, hours, mins, secs, ms = m.groups()
2556 else:
2557 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2558 if m:
2559 hours, mins = m.groups()
2560 else:
2561 return None
2562
2563 if ms:
2564 ms = ms.replace(':', '.')
2565 return sum(float(part or 0) * mult for part, mult in (
2566 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2567
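# A few expected results (worked out by hand; durations are returned in seconds):
#   parse_duration('1:23:45') == 5025.0
#   parse_duration('2h 30m') == 9000.0
#   parse_duration('PT1M30S') == 90.0  # ISO 8601 style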
2568
2569 def prepend_extension(filename, ext, expected_real_ext=None):
2570 name, real_ext = os.path.splitext(filename)
2571 return (
2572 f'{name}.{ext}{real_ext}'
2573 if not expected_real_ext or real_ext[1:] == expected_real_ext
2574 else f'{filename}.{ext}')
2575
2576
2577 def replace_extension(filename, ext, expected_real_ext=None):
2578 name, real_ext = os.path.splitext(filename)
2579 return '{}.{}'.format(
2580 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2581 ext)
2582
2583
2584 def check_executable(exe, args=[]):
2585 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2586 args can be a list of arguments for a short output (like -version) """
2587 try:
2588 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2589 except OSError:
2590 return False
2591 return exe
2592
2593
2594 def _get_exe_version_output(exe, args, *, to_screen=None):
2595 if to_screen:
2596 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2597 try:
2598 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2599 # SIGTTOU if yt-dlp is run in the background.
2600 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2601 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2602 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2603 except OSError:
2604 return False
2605 return stdout
2606
2607
2608 def detect_exe_version(output, version_re=None, unrecognized='present'):
2609 assert isinstance(output, str)
2610 if version_re is None:
2611 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2612 m = re.search(version_re, output)
2613 if m:
2614 return m.group(1)
2615 else:
2616 return unrecognized
2617
2618
2619 def get_exe_version(exe, args=['--version'],
2620 version_re=None, unrecognized='present'):
2621 """ Returns the version of the specified executable,
2622 or False if the executable is not present """
2623 out = _get_exe_version_output(exe, args)
2624 return detect_exe_version(out, version_re, unrecognized) if out else False
2625
2626
2627 def frange(start=0, stop=None, step=1):
2628 """Float range"""
2629 if stop is None:
2630 start, stop = 0, start
2631 sign = [-1, 1][step > 0] if step else 0
2632 while sign * start < sign * stop:
2633 yield start
2634 start += step
2635
2636
2637 class LazyList(collections.abc.Sequence):
2638 """Lazy immutable list from an iterable
2639 Note that slices of a LazyList are lists and not LazyList"""
2640
2641 class IndexError(IndexError):
2642 pass
2643
2644 def __init__(self, iterable, *, reverse=False, _cache=None):
2645 self._iterable = iter(iterable)
2646 self._cache = [] if _cache is None else _cache
2647 self._reversed = reverse
2648
2649 def __iter__(self):
2650 if self._reversed:
2651 # We need to consume the entire iterable to iterate in reverse
2652 yield from self.exhaust()
2653 return
2654 yield from self._cache
2655 for item in self._iterable:
2656 self._cache.append(item)
2657 yield item
2658
2659 def _exhaust(self):
2660 self._cache.extend(self._iterable)
2661 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2662 return self._cache
2663
2664 def exhaust(self):
2665 """Evaluate the entire iterable"""
2666 return self._exhaust()[::-1 if self._reversed else 1]
2667
2668 @staticmethod
2669 def _reverse_index(x):
2670 return None if x is None else ~x
2671
2672 def __getitem__(self, idx):
2673 if isinstance(idx, slice):
2674 if self._reversed:
2675 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2676 start, stop, step = idx.start, idx.stop, idx.step or 1
2677 elif isinstance(idx, int):
2678 if self._reversed:
2679 idx = self._reverse_index(idx)
2680 start, stop, step = idx, idx, 0
2681 else:
2682 raise TypeError('indices must be integers or slices')
2683 if ((start or 0) < 0 or (stop or 0) < 0
2684 or (start is None and step < 0)
2685 or (stop is None and step > 0)):
2686 # We need to consume the entire iterable to be able to slice from the end
2687 # Obviously, never use this with infinite iterables
2688 self._exhaust()
2689 try:
2690 return self._cache[idx]
2691 except IndexError as e:
2692 raise self.IndexError(e) from e
2693 n = max(start or 0, stop or 0) - len(self._cache) + 1
2694 if n > 0:
2695 self._cache.extend(itertools.islice(self._iterable, n))
2696 try:
2697 return self._cache[idx]
2698 except IndexError as e:
2699 raise self.IndexError(e) from e
2700
2701 def __bool__(self):
2702 try:
2703 self[-1] if self._reversed else self[0]
2704 except self.IndexError:
2705 return False
2706 return True
2707
2708 def __len__(self):
2709 self._exhaust()
2710 return len(self._cache)
2711
2712 def __reversed__(self):
2713 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2714
2715 def __copy__(self):
2716 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2717
2718 def __repr__(self):
2719 # repr and str should mimic a list. So we exhaust the iterable
2720 return repr(self.exhaust())
2721
2722 def __str__(self):
2723 return repr(self.exhaust())
2724
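# Usage sketch - the wrapped iterable is only consumed as far as needed, so
# this is safe even with infinite iterators (as long as len() and negative
# indices, which exhaust the iterable, are avoided):
#   l = LazyList(itertools.count())
#   l[:5] == [0, 1, 2, 3, 4]  # a plain list, per the class docstring
#   l[7] == 7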
2725
2726 class PagedList:
2727
2728 class IndexError(IndexError):
2729 pass
2730
2731 def __len__(self):
2732 # This is only useful for tests
2733 return len(self.getslice())
2734
2735 def __init__(self, pagefunc, pagesize, use_cache=True):
2736 self._pagefunc = pagefunc
2737 self._pagesize = pagesize
2738 self._pagecount = float('inf')
2739 self._use_cache = use_cache
2740 self._cache = {}
2741
2742 def getpage(self, pagenum):
2743 page_results = self._cache.get(pagenum)
2744 if page_results is None:
2745 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2746 if self._use_cache:
2747 self._cache[pagenum] = page_results
2748 return page_results
2749
2750 def getslice(self, start=0, end=None):
2751 return list(self._getslice(start, end))
2752
2753 def _getslice(self, start, end):
2754 raise NotImplementedError('This method must be implemented by subclasses')
2755
2756 def __getitem__(self, idx):
2757 assert self._use_cache, 'Indexing PagedList requires cache'
2758 if not isinstance(idx, int) or idx < 0:
2759 raise TypeError('indices must be non-negative integers')
2760 entries = self.getslice(idx, idx + 1)
2761 if not entries:
2762 raise self.IndexError()
2763 return entries[0]
2764
2765
2766 class OnDemandPagedList(PagedList):
2767 """Download pages until a page with less than maximum results"""
2768
2769 def _getslice(self, start, end):
2770 for pagenum in itertools.count(start // self._pagesize):
2771 firstid = pagenum * self._pagesize
2772 nextfirstid = pagenum * self._pagesize + self._pagesize
2773 if start >= nextfirstid:
2774 continue
2775
2776 startv = (
2777 start % self._pagesize
2778 if firstid <= start < nextfirstid
2779 else 0)
2780 endv = (
2781 ((end - 1) % self._pagesize) + 1
2782 if (end is not None and firstid <= end <= nextfirstid)
2783 else None)
2784
2785 try:
2786 page_results = self.getpage(pagenum)
2787 except Exception:
2788 self._pagecount = pagenum - 1
2789 raise
2790 if startv != 0 or endv is not None:
2791 page_results = page_results[startv:endv]
2792 yield from page_results
2793
2794 # A little optimization - if the current page is not "full", i.e. does
2795 # not contain page_size videos, then we can assume that this page is
2796 # the last one - there are no more ids on further pages and hence no
2797 # need to query again.
2798 if len(page_results) + startv < self._pagesize:
2799 break
2800
2801 # If we got the whole page, but the next page is not interesting,
2802 # break out early as well
2803 if end == nextfirstid:
2804 break
2805
2806
2807 class InAdvancePagedList(PagedList):
2808 """PagedList with total number of pages known in advance"""
2809
2810 def __init__(self, pagefunc, pagecount, pagesize):
2811 PagedList.__init__(self, pagefunc, pagesize, True)
2812 self._pagecount = pagecount
2813
2814 def _getslice(self, start, end):
2815 start_page = start // self._pagesize
2816 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2817 skip_elems = start - start_page * self._pagesize
2818 only_more = None if end is None else end - start
2819 for pagenum in range(start_page, end_page):
2820 page_results = self.getpage(pagenum)
2821 if skip_elems:
2822 page_results = page_results[skip_elems:]
2823 skip_elems = None
2824 if only_more is not None:
2825 if len(page_results) < only_more:
2826 only_more -= len(page_results)
2827 else:
2828 yield from page_results[:only_more]
2829 break
2830 yield from page_results
2831
2832
2833 class PlaylistEntries:
2834 MissingEntry = object()
2835 is_exhausted = False
2836
2837 def __init__(self, ydl, info_dict):
2838 self.ydl = ydl
2839
2840 # _entries must be assigned now since infodict can change during iteration
2841 entries = info_dict.get('entries')
2842 if entries is None:
2843 raise EntryNotInPlaylist('There are no entries')
2844 elif isinstance(entries, list):
2845 self.is_exhausted = True
2846
2847 requested_entries = info_dict.get('requested_entries')
2848 self.is_incomplete = bool(requested_entries)
2849 if self.is_incomplete:
2850 assert self.is_exhausted
2851 self._entries = [self.MissingEntry] * max(requested_entries)
2852 for i, entry in zip(requested_entries, entries):
2853 self._entries[i - 1] = entry
2854 elif isinstance(entries, (list, PagedList, LazyList)):
2855 self._entries = entries
2856 else:
2857 self._entries = LazyList(entries)
2858
2859 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2860 (?P<start>[+-]?\d+)?
2861 (?P<range>[:-]
2862 (?P<end>[+-]?\d+|inf(?:inite)?)?
2863 (?::(?P<step>[+-]?\d+))?
2864 )?''')
2865
2866 @classmethod
2867 def parse_playlist_items(cls, string):
2868 for segment in string.split(','):
2869 if not segment:
2870 raise ValueError('There are two or more consecutive commas')
2871 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2872 if not mobj:
2873 raise ValueError(f'{segment!r} is not a valid specification')
2874 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2875 if int_or_none(step) == 0:
2876 raise ValueError(f'Step in {segment!r} cannot be zero')
2877 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2878
2879 def get_requested_items(self):
2880 playlist_items = self.ydl.params.get('playlist_items')
2881 playlist_start = self.ydl.params.get('playliststart', 1)
2882 playlist_end = self.ydl.params.get('playlistend')
2883 # For backwards compatibility, interpret -1 as whole list
2884 if playlist_end in (-1, None):
2885 playlist_end = ''
2886 if not playlist_items:
2887 playlist_items = f'{playlist_start}:{playlist_end}'
2888 elif playlist_start != 1 or playlist_end:
2889 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2890
2891 for index in self.parse_playlist_items(playlist_items):
2892 for i, entry in self[index]:
2893 yield i, entry
2894 if not entry:
2895 continue
2896 try:
2897 # TODO: Add auto-generated fields
2898 self.ydl._match_entry(entry, incomplete=True, silent=True)
2899 except (ExistingVideoReached, RejectedVideoReached):
2900 return
2901
2902 def get_full_count(self):
2903 if self.is_exhausted and not self.is_incomplete:
2904 return len(self)
2905 elif isinstance(self._entries, InAdvancePagedList):
2906 if self._entries._pagesize == 1:
2907 return self._entries._pagecount
2908
2909 @functools.cached_property
2910 def _getter(self):
2911 if isinstance(self._entries, list):
2912 def get_entry(i):
2913 try:
2914 entry = self._entries[i]
2915 except IndexError:
2916 entry = self.MissingEntry
2917 if not self.is_incomplete:
2918 raise self.IndexError()
2919 if entry is self.MissingEntry:
2920 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2921 return entry
2922 else:
2923 def get_entry(i):
2924 try:
2925 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2926 except (LazyList.IndexError, PagedList.IndexError):
2927 raise self.IndexError()
2928 return get_entry
2929
2930 def __getitem__(self, idx):
2931 if isinstance(idx, int):
2932 idx = slice(idx, idx)
2933
2934 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2935 step = 1 if idx.step is None else idx.step
2936 if idx.start is None:
2937 start = 0 if step > 0 else len(self) - 1
2938 else:
2939 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2940
2941 # NB: Do not call len(self) when idx == [:]
2942 if idx.stop is None:
2943 stop = 0 if step < 0 else float('inf')
2944 else:
2945 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2946 stop += [-1, 1][step > 0]
2947
2948 for i in frange(start, stop, step):
2949 if i < 0:
2950 continue
2951 try:
2952 entry = self._getter(i)
2953 except self.IndexError:
2954 self.is_exhausted = True
2955 if step > 0:
2956 break
2957 continue
2958 yield i + 1, entry
2959
2960 def __len__(self):
2961 return len(tuple(self[:]))
2962
2963 class IndexError(IndexError):
2964 pass
2965
2966
2967 def uppercase_escape(s):
2968 unicode_escape = codecs.getdecoder('unicode_escape')
2969 return re.sub(
2970 r'\\U[0-9a-fA-F]{8}',
2971 lambda m: unicode_escape(m.group(0))[0],
2972 s)
2973
2974
2975 def lowercase_escape(s):
2976 unicode_escape = codecs.getdecoder('unicode_escape')
2977 return re.sub(
2978 r'\\u[0-9a-fA-F]{4}',
2979 lambda m: unicode_escape(m.group(0))[0],
2980 s)
2981
2982
2983 def escape_rfc3986(s):
2984 """Escape non-ASCII characters as suggested by RFC 3986"""
2985 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2986
2987
2988 def escape_url(url):
2989 """Escape URL as suggested by RFC 3986"""
2990 url_parsed = urllib.parse.urlparse(url)
2991 return url_parsed._replace(
2992 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2993 path=escape_rfc3986(url_parsed.path),
2994 params=escape_rfc3986(url_parsed.params),
2995 query=escape_rfc3986(url_parsed.query),
2996 fragment=escape_rfc3986(url_parsed.fragment)
2997 ).geturl()
2998
2999
3000 def parse_qs(url):
3001 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3002
3003
3004 def read_batch_urls(batch_fd):
3005 def fixup(url):
3006 if not isinstance(url, str):
3007 url = url.decode('utf-8', 'replace')
3008 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3009 for bom in BOM_UTF8:
3010 if url.startswith(bom):
3011 url = url[len(bom):]
3012 url = url.lstrip()
3013 if not url or url.startswith(('#', ';', ']')):
3014 return False
3015 # "#" cannot be stripped out since it is part of the URI
3016 # However, it can be safely stripped out if it follows whitespace
3017 return re.split(r'\s#', url, 1)[0].rstrip()
3018
3019 with contextlib.closing(batch_fd) as fd:
3020 return [url for url in map(fixup, fd) if url]
3021
3022
3023 def urlencode_postdata(*args, **kargs):
3024 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3025
3026
3027 def update_url_query(url, query):
3028 if not query:
3029 return url
3030 parsed_url = urllib.parse.urlparse(url)
3031 qs = urllib.parse.parse_qs(parsed_url.query)
3032 qs.update(query)
3033 return urllib.parse.urlunparse(parsed_url._replace(
3034 query=urllib.parse.urlencode(qs, True)))
3035
3036
3037 def update_Request(req, url=None, data=None, headers=None, query=None):
3038 req_headers = req.headers.copy()
3039 req_headers.update(headers or {})
3040 req_data = data or req.data
3041 req_url = update_url_query(url or req.get_full_url(), query)
3042 req_get_method = req.get_method()
3043 if req_get_method == 'HEAD':
3044 req_type = HEADRequest
3045 elif req_get_method == 'PUT':
3046 req_type = PUTRequest
3047 else:
3048 req_type = urllib.request.Request
3049 new_req = req_type(
3050 req_url, data=req_data, headers=req_headers,
3051 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3052 if hasattr(req, 'timeout'):
3053 new_req.timeout = req.timeout
3054 return new_req
3055
3056
3057 def _multipart_encode_impl(data, boundary):
3058 content_type = 'multipart/form-data; boundary=%s' % boundary
3059
3060 out = b''
3061 for k, v in data.items():
3062 out += b'--' + boundary.encode('ascii') + b'\r\n'
3063 if isinstance(k, str):
3064 k = k.encode()
3065 if isinstance(v, str):
3066 v = v.encode()
3067 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3068 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3069 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3070 if boundary.encode('ascii') in content:
3071 raise ValueError('Boundary overlaps with data')
3072 out += content
3073
3074 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3075
3076 return out, content_type
3077
3078
3079 def multipart_encode(data, boundary=None):
3080 '''
3081 Encode a dict to RFC 7578-compliant form-data
3082
3083 data:
3084 A dict where keys and values can be either Unicode or bytes-like
3085 objects.
3086 boundary:
3087 If specified, it must be a Unicode object to be used as the boundary.
3088 Otherwise a random boundary is generated.
3089
3090 Reference: https://tools.ietf.org/html/rfc7578
3091 '''
3092 has_specified_boundary = boundary is not None
3093
3094 while True:
3095 if boundary is None:
3096 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3097
3098 try:
3099 out, content_type = _multipart_encode_impl(data, boundary)
3100 break
3101 except ValueError:
3102 if has_specified_boundary:
3103 raise
3104 boundary = None
3105
3106 return out, content_type
3107
3108
3109 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3110 for val in map(d.get, variadic(key_or_keys)):
3111 if val is not None and (val or not skip_false_values):
3112 return val
3113 return default
3114
3115
3116 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3117 for f in funcs:
3118 try:
3119 val = f(*args, **kwargs)
3120 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3121 pass
3122 else:
3123 if expected_type is None or isinstance(val, expected_type):
3124 return val
3125
3126
3127 def try_get(src, getter, expected_type=None):
3128 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3129
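# Sketch of the expected behaviour of the two helpers above:
#   try_call(lambda: 1 // 0, lambda: 7) == 7  # first callable raises, second wins
#   try_get({'a': {'b': 42}}, lambda x: x['a']['b'], int) == 42
#   try_get({}, lambda x: x['a']['b']) is None  # the KeyError is swallowed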
3130
3131 def filter_dict(dct, cndn=lambda _, v: v is not None):
3132 return {k: v for k, v in dct.items() if cndn(k, v)}
3133
3134
3135 def merge_dicts(*dicts):
3136 merged = {}
3137 for a_dict in dicts:
3138 for k, v in a_dict.items():
3139 if (v is not None and k not in merged
3140 or isinstance(v, str) and merged[k] == ''):
3141 merged[k] = v
3142 return merged
3143
3144
3145 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3146 return string if isinstance(string, str) else str(string, encoding, errors)
3147
3148
3149 US_RATINGS = {
3150 'G': 0,
3151 'PG': 10,
3152 'PG-13': 13,
3153 'R': 16,
3154 'NC': 18,
3155 }
3156
3157
3158 TV_PARENTAL_GUIDELINES = {
3159 'TV-Y': 0,
3160 'TV-Y7': 7,
3161 'TV-G': 0,
3162 'TV-PG': 0,
3163 'TV-14': 14,
3164 'TV-MA': 17,
3165 }
3166
3167
3168 def parse_age_limit(s):
3169 # isinstance(False, int) is True. So type() must be used instead
3170 if type(s) is int: # noqa: E721
3171 return s if 0 <= s <= 21 else None
3172 elif not isinstance(s, str):
3173 return None
3174 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3175 if m:
3176 return int(m.group('age'))
3177 s = s.upper()
3178 if s in US_RATINGS:
3179 return US_RATINGS[s]
3180 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3181 if m:
3182 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3183 return None
3184
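# Expected results for typical inputs:
#   parse_age_limit('18+') == 18
#   parse_age_limit('TV-MA') == 17
#   parse_age_limit(False) is None  # bools are deliberately rejected, per the comment above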
3185
3186 def strip_jsonp(code):
3187 return re.sub(
3188 r'''(?sx)^
3189 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3190 (?:\s*&&\s*(?P=func_name))?
3191 \s*\(\s*(?P<callback_data>.*)\);?
3192 \s*?(?://[^\n]*)*$''',
3193 r'\g<callback_data>', code)
3194
3195
3196 def js_to_json(code, vars={}):
3197 # vars is a dict of var, val pairs to substitute
3198 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3199 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3200 INTEGER_TABLE = (
3201 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3202 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3203 )
3204
3205 def fix_kv(m):
3206 v = m.group(0)
3207 if v in ('true', 'false', 'null'):
3208 return v
3209 elif v in ('undefined', 'void 0'):
3210 return 'null'
3211 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3212 return ""
3213
3214 if v[0] in ("'", '"'):
3215 v = re.sub(r'(?s)\\.|"', lambda m: {
3216 '"': '\\"',
3217 "\\'": "'",
3218 '\\\n': '',
3219 '\\x': '\\u00',
3220 }.get(m.group(0), m.group(0)), v[1:-1])
3221 else:
3222 for regex, base in INTEGER_TABLE:
3223 im = re.match(regex, v)
3224 if im:
3225 i = int(im.group(1), base)
3226 return '"%d":' % i if v.endswith(':') else '%d' % i
3227
3228 if v in vars:
3229 return vars[v]
3230
3231 return '"%s"' % v
3232
3233 def create_map(mobj):
3234 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3235
3236 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3237 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3238
3239 return re.sub(r'''(?sx)
3240 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3241 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3242 {comment}|,(?={skip}[\]}}])|
3243 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3244 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3245 [0-9]+(?={skip}:)|
3246 !+
3247 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3248
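# Illustrative conversions (expected, not executed):
#   js_to_json("{abc: 'def', ghi: 0x1F}") == '{"abc": "def", "ghi": 31}'
#   js_to_json('{a: undefined}') == '{"a": null}'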
3249
3250 def qualities(quality_ids):
3251 """ Get a numeric quality value out of a list of possible values """
3252 def q(qid):
3253 try:
3254 return quality_ids.index(qid)
3255 except ValueError:
3256 return -1
3257 return q
3258
3259
3260 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3261
3262
3263 DEFAULT_OUTTMPL = {
3264 'default': '%(title)s [%(id)s].%(ext)s',
3265 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3266 }
3267 OUTTMPL_TYPES = {
3268 'chapter': None,
3269 'subtitle': None,
3270 'thumbnail': None,
3271 'description': 'description',
3272 'annotation': 'annotations.xml',
3273 'infojson': 'info.json',
3274 'link': None,
3275 'pl_video': None,
3276 'pl_thumbnail': None,
3277 'pl_description': 'description',
3278 'pl_infojson': 'info.json',
3279 }
3280
3281 # As of [1] format syntax is:
3282 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3283 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3284 STR_FORMAT_RE_TMPL = r'''(?x)
3285 (?<!%)(?P<prefix>(?:%%)*)
3286 %
3287 (?P<has_key>\((?P<key>{0})\))?
3288 (?P<format>
3289 (?P<conversion>[#0\-+ ]+)?
3290 (?P<min_width>\d+)?
3291 (?P<precision>\.\d+)?
3292 (?P<len_mod>[hlL])? # unused in python
3293 {1} # conversion type
3294 )
3295 '''
3296
3297
3298 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3299
3300
3301 def limit_length(s, length):
3302 """ Add ellipses to overly long strings """
3303 if s is None:
3304 return None
3305 ELLIPSES = '...'
3306 if len(s) > length:
3307 return s[:length - len(ELLIPSES)] + ELLIPSES
3308 return s
3309
3310
3311 def version_tuple(v):
3312 return tuple(int(e) for e in re.split(r'[-.]', v))
3313
3314
3315 def is_outdated_version(version, limit, assume_new=True):
3316 if not version:
3317 return not assume_new
3318 try:
3319 return version_tuple(version) < version_tuple(limit)
3320 except ValueError:
3321 return not assume_new
3322
3323
3324 def ytdl_is_updateable():
3325 """ Returns if yt-dlp can be updated with -U """
3326
3327 from .update import is_non_updateable
3328
3329 return not is_non_updateable()
3330
3331
3332 def args_to_str(args):
3333 # Get a short string representation for a subprocess command
3334 return ' '.join(compat_shlex_quote(a) for a in args)
3335
3336
3337 def error_to_compat_str(err):
3338 return str(err)
3339
3340
3341 def error_to_str(err):
3342 return f'{type(err).__name__}: {err}'
3343
3344
3345 def mimetype2ext(mt):
3346 if mt is None:
3347 return None
3348
3349 mt, _, params = mt.partition(';')
3350 mt = mt.strip()
3351
3352 FULL_MAP = {
3353 'audio/mp4': 'm4a',
3354 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
3355 # since it is the most popular one
3356 'audio/mpeg': 'mp3',
3357 'audio/x-wav': 'wav',
3358 'audio/wav': 'wav',
3359 'audio/wave': 'wav',
3360 }
3361
3362 ext = FULL_MAP.get(mt)
3363 if ext is not None:
3364 return ext
3365
3366 SUBTYPE_MAP = {
3367 '3gpp': '3gp',
3368 'smptett+xml': 'tt',
3369 'ttaf+xml': 'dfxp',
3370 'ttml+xml': 'ttml',
3371 'x-flv': 'flv',
3372 'x-mp4-fragmented': 'mp4',
3373 'x-ms-sami': 'sami',
3374 'x-ms-wmv': 'wmv',
3375 'mpegurl': 'm3u8',
3376 'x-mpegurl': 'm3u8',
3377 'vnd.apple.mpegurl': 'm3u8',
3378 'dash+xml': 'mpd',
3379 'f4m+xml': 'f4m',
3380 'hds+xml': 'f4m',
3381 'vnd.ms-sstr+xml': 'ism',
3382 'quicktime': 'mov',
3383 'mp2t': 'ts',
3384 'x-wav': 'wav',
3385 'filmstrip+json': 'fs',
3386 'svg+xml': 'svg',
3387 }
3388
3389 _, _, subtype = mt.rpartition('/')
3390 ext = SUBTYPE_MAP.get(subtype.lower())
3391 if ext is not None:
3392 return ext
3393
3394 SUFFIX_MAP = {
3395 'json': 'json',
3396 'xml': 'xml',
3397 'zip': 'zip',
3398 'gzip': 'gz',
3399 }
3400
3401 _, _, suffix = subtype.partition('+')
3402 ext = SUFFIX_MAP.get(suffix)
3403 if ext is not None:
3404 return ext
3405
3406 return subtype.replace('+', '.')
3407
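# Sketch of the lookup order above (full type, then subtype, then suffix;
# 'vnd.foo' is a made-up subtype for illustration):
#   mimetype2ext('audio/mp4') == 'm4a'                            # FULL_MAP
#   mimetype2ext('application/x-mpegURL') == 'm3u8'               # SUBTYPE_MAP
#   mimetype2ext('application/dash+xml; charset=UTF-8') == 'mpd'  # params are stripped first
#   mimetype2ext('application/vnd.foo+json') == 'json'            # SUFFIX_MAP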
3408
3409 def ext2mimetype(ext_or_url):
3410 if not ext_or_url:
3411 return None
3412 if '.' not in ext_or_url:
3413 ext_or_url = f'file.{ext_or_url}'
3414 return mimetypes.guess_type(ext_or_url)[0]
3415
3416
3417 def parse_codecs(codecs_str):
3418 # http://tools.ietf.org/html/rfc6381
3419 if not codecs_str:
3420 return {}
3421 split_codecs = list(filter(None, map(
3422 str.strip, codecs_str.strip().strip(',').split(','))))
3423 vcodec, acodec, scodec, hdr = None, None, None, None
3424 for full_codec in split_codecs:
3425 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3426 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3427 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3428 if vcodec:
3429 continue
3430 vcodec = full_codec
3431 if parts[0] in ('dvh1', 'dvhe'):
3432 hdr = 'DV'
3433 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3434 hdr = 'HDR10'
3435 elif parts[:2] == ['vp9', '2']:
3436 hdr = 'HDR10'
3437 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3438 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3439 acodec = acodec or full_codec
3440 elif parts[0] in ('stpp', 'wvtt'):
3441 scodec = scodec or full_codec
3442 else:
3443 write_string(f'WARNING: Unknown codec {full_codec}\n')
3444 if vcodec or acodec or scodec:
3445 return {
3446 'vcodec': vcodec or 'none',
3447 'acodec': acodec or 'none',
3448 'dynamic_range': hdr,
3449 **({'scodec': scodec} if scodec is not None else {}),
3450 }
3451 elif len(split_codecs) == 2:
3452 return {
3453 'vcodec': split_codecs[0],
3454 'acodec': split_codecs[1],
3455 }
3456 return {}
3457
3458
3459 def urlhandle_detect_ext(url_handle):
3460 getheader = url_handle.headers.get
3461
3462 cd = getheader('Content-Disposition')
3463 if cd:
3464 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3465 if m:
3466 e = determine_ext(m.group('filename'), default_ext=None)
3467 if e:
3468 return e
3469
3470 return mimetype2ext(getheader('Content-Type'))
3471
3472
3473 def encode_data_uri(data, mime_type):
3474 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3475
3476
3477 def age_restricted(content_limit, age_limit):
3478 """ Returns True iff the content should be blocked """
3479
3480 if age_limit is None: # No limit set
3481 return False
3482 if content_limit is None:
3483 return False # Content available for everyone
3484 return age_limit < content_limit
3485
3486
3487 # List of known byte-order-marks (BOM)
3488 BOMS = [
3489 (b'\xef\xbb\xbf', 'utf-8'),
3490 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3491 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3492 (b'\xff\xfe', 'utf-16-le'),
3493 (b'\xfe\xff', 'utf-16-be'),
3494 ]
3495
3496
3497 def is_html(first_bytes):
3498 """ Detect whether a file contains HTML by examining its first bytes. """
3499
3500 encoding = 'utf-8'
3501 for bom, enc in BOMS:
3502 while first_bytes.startswith(bom):
3503 encoding, first_bytes = enc, first_bytes[len(bom):]
3504
3505 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3506
3507
3508 def determine_protocol(info_dict):
3509 protocol = info_dict.get('protocol')
3510 if protocol is not None:
3511 return protocol
3512
3513 url = sanitize_url(info_dict['url'])
3514 if url.startswith('rtmp'):
3515 return 'rtmp'
3516 elif url.startswith('mms'):
3517 return 'mms'
3518 elif url.startswith('rtsp'):
3519 return 'rtsp'
3520
3521 ext = determine_ext(url)
3522 if ext == 'm3u8':
3523 return 'm3u8'
3524 elif ext == 'f4m':
3525 return 'f4m'
3526
3527 return urllib.parse.urlparse(url).scheme
3528
3529
3530 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3531 """ Render a list of rows, each as a list of values.
3532 Text after a \t will be right aligned """
3533 def width(string):
3534 return len(remove_terminal_sequences(string).replace('\t', ''))
3535
3536 def get_max_lens(table):
3537 return [max(width(str(v)) for v in col) for col in zip(*table)]
3538
3539 def filter_using_list(row, filterArray):
3540 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3541
3542 max_lens = get_max_lens(data) if hide_empty else []
3543 header_row = filter_using_list(header_row, max_lens)
3544 data = [filter_using_list(row, max_lens) for row in data]
3545
3546 table = [header_row] + data
3547 max_lens = get_max_lens(table)
3548 extra_gap += 1
3549 if delim:
3550 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3551 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3552 for row in table:
3553 for pos, text in enumerate(map(str, row)):
3554 if '\t' in text:
3555 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3556 else:
3557 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3558 ret = '\n'.join(''.join(row).rstrip() for row in table)
3559 return ret
3560
3561
3562 def _match_one(filter_part, dct, incomplete):
3563 # TODO: Generalize code with YoutubeDL._build_format_filter
3564 STRING_OPERATORS = {
3565 '*=': operator.contains,
3566 '^=': lambda attr, value: attr.startswith(value),
3567 '$=': lambda attr, value: attr.endswith(value),
3568 '~=': lambda attr, value: re.search(value, attr),
3569 }
3570 COMPARISON_OPERATORS = {
3571 **STRING_OPERATORS,
3572 '<=': operator.le, # "<=" must be defined above "<"
3573 '<': operator.lt,
3574 '>=': operator.ge,
3575 '>': operator.gt,
3576 '=': operator.eq,
3577 }
3578
3579 if isinstance(incomplete, bool):
3580 is_incomplete = lambda _: incomplete
3581 else:
3582 is_incomplete = lambda k: k in incomplete
3583
3584 operator_rex = re.compile(r'''(?x)
3585 (?P<key>[a-z_]+)
3586 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3587 (?:
3588 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3589 (?P<strval>.+?)
3590 )
3591 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3592 m = operator_rex.fullmatch(filter_part.strip())
3593 if m:
3594 m = m.groupdict()
3595 unnegated_op = COMPARISON_OPERATORS[m['op']]
3596 if m['negation']:
3597 op = lambda attr, value: not unnegated_op(attr, value)
3598 else:
3599 op = unnegated_op
3600 comparison_value = m['quotedstrval'] or m['strval']  # the regex above defines no other value groups
3601 if m['quote']:
3602 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3603 actual_value = dct.get(m['key'])
3604 numeric_comparison = None
3605 if isinstance(actual_value, (int, float)):
3606 # Numeric comparison is attempted only when the original field is
3607 # numeric. If the original field is a string, the comparison value is
3608 # processed as a string too, even if it looks like a number (see
3609 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3610 try:
3611 numeric_comparison = int(comparison_value)
3612 except ValueError:
3613 numeric_comparison = parse_filesize(comparison_value)
3614 if numeric_comparison is None:
3615 numeric_comparison = parse_filesize(f'{comparison_value}B')
3616 if numeric_comparison is None:
3617 numeric_comparison = parse_duration(comparison_value)
3618 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3619 raise ValueError('Operator %s only supports string values!' % m['op'])
3620 if actual_value is None:
3621 return is_incomplete(m['key']) or m['none_inclusive']
3622 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3623
3624 UNARY_OPERATORS = {
3625 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3626 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3627 }
3628 operator_rex = re.compile(r'''(?x)
3629 (?P<op>%s)\s*(?P<key>[a-z_]+)
3630 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3631 m = operator_rex.fullmatch(filter_part.strip())
3632 if m:
3633 op = UNARY_OPERATORS[m.group('op')]
3634 actual_value = dct.get(m.group('key'))
3635 if is_incomplete(m.group('key')) and actual_value is None:
3636 return True
3637 return op(actual_value)
3638
3639 raise ValueError('Invalid filter part %r' % filter_part)
3640
3641
3642 def match_str(filter_str, dct, incomplete=False):
3643 """ Filter a dictionary with a simple string syntax.
3644 @returns Whether the filter passes
3645 @param incomplete Set of keys that is expected to be missing from dct.
3646 Can be True/False to indicate all/none of the keys may be missing.
3647 All conditions on incomplete keys pass if the key is missing
3648 """
3649 return all(
3650 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3651 for filter_part in re.split(r'(?<!\\)&', filter_str))
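
# Filter-syntax sketch (expected results assumed):
#   >>> match_str('!is_live & like_count>?100', {'like_count': 190, 'is_live': False})
#   True   # '!' tests falsiness, '>' compares numerically, '?' also passes if the key is missing
#   >>> match_str('title~=(?i)foo', {'title': 'FooBar'})
#   True   # '~=' matches the value against a regular expression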
3652
3653
3654 def match_filter_func(filters):
3655 if not filters:
3656 return None
3657 filters = set(variadic(filters))
3658
3659 interactive = '-' in filters
3660 if interactive:
3661 filters.remove('-')
3662
3663 def _match_func(info_dict, incomplete=False):
3664 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3665 return NO_DEFAULT if interactive and not incomplete else None
3666 else:
3667 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3668 filter_str = ') | ('.join(map(str.strip, filters))
3669 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3670 return _match_func
3671
3672
3673 class download_range_func:
3674 def __init__(self, chapters, ranges):
3675 self.chapters, self.ranges = chapters, ranges
3676
3677 def __call__(self, info_dict, ydl):
3678 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3679 else 'Cannot match chapters since chapter information is unavailable')
3680 for regex in self.chapters or []:
3681 for i, chapter in enumerate(info_dict.get('chapters') or []):
3682 if re.search(regex, chapter['title']):
3683 warning = None
3684 yield {**chapter, 'index': i}
3685 if self.chapters and warning:
3686 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3687
3688 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3689
3690 def __eq__(self, other):
3691 return (isinstance(other, download_range_func)
3692 and self.chapters == other.chapters and self.ranges == other.ranges)
3693
3694
3695 def parse_dfxp_time_expr(time_expr):
3696 if not time_expr:
3697 return
3698
3699 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3700 if mobj:
3701 return float(mobj.group('time_offset'))
3702
3703 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3704 if mobj:
3705 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
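
# Conversion sketch (expected values assumed from the regexes above):
#   >>> parse_dfxp_time_expr('5.3s')
#   5.3
#   >>> parse_dfxp_time_expr('00:01:02.5')
#   62.5
#   >>> parse_dfxp_time_expr('00:01:02:12')   # frame-style times are read as fractions
#   62.12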
3706
3707
3708 def srt_subtitles_timecode(seconds):
3709 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3710
3711
3712 def ass_subtitles_timecode(seconds):
3713 time = timetuple_from_msec(seconds * 1000)
3714 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
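
# Timecode formatting sketch (expected values assumed):
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'   # ASS uses centiseconds and a single hour digit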
3715
3716
3717 def dfxp2srt(dfxp_data):
3718 '''
3719 @param dfxp_data A bytes-like object containing DFXP data
3720 @returns A unicode object containing converted SRT data
3721 '''
3722 LEGACY_NAMESPACES = (
3723 (b'http://www.w3.org/ns/ttml', [
3724 b'http://www.w3.org/2004/11/ttaf1',
3725 b'http://www.w3.org/2006/04/ttaf1',
3726 b'http://www.w3.org/2006/10/ttaf1',
3727 ]),
3728 (b'http://www.w3.org/ns/ttml#styling', [
3729 b'http://www.w3.org/ns/ttml#style',
3730 ]),
3731 )
3732
3733 SUPPORTED_STYLING = [
3734 'color',
3735 'fontFamily',
3736 'fontSize',
3737 'fontStyle',
3738 'fontWeight',
3739 'textDecoration'
3740 ]
3741
3742 _x = functools.partial(xpath_with_ns, ns_map={
3743 'xml': 'http://www.w3.org/XML/1998/namespace',
3744 'ttml': 'http://www.w3.org/ns/ttml',
3745 'tts': 'http://www.w3.org/ns/ttml#styling',
3746 })
3747
3748 styles = {}
3749 default_style = {}
3750
3751 class TTMLPElementParser:
3752 _out = ''
3753 _unclosed_elements = []
3754 _applied_styles = []
3755
3756 def start(self, tag, attrib):
3757 if tag in (_x('ttml:br'), 'br'):
3758 self._out += '\n'
3759 else:
3760 unclosed_elements = []
3761 style = {}
3762 element_style_id = attrib.get('style')
3763 if default_style:
3764 style.update(default_style)
3765 if element_style_id:
3766 style.update(styles.get(element_style_id, {}))
3767 for prop in SUPPORTED_STYLING:
3768 prop_val = attrib.get(_x('tts:' + prop))
3769 if prop_val:
3770 style[prop] = prop_val
3771 if style:
3772 font = ''
3773 for k, v in sorted(style.items()):
3774 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3775 continue
3776 if k == 'color':
3777 font += ' color="%s"' % v
3778 elif k == 'fontSize':
3779 font += ' size="%s"' % v
3780 elif k == 'fontFamily':
3781 font += ' face="%s"' % v
3782 elif k == 'fontWeight' and v == 'bold':
3783 self._out += '<b>'
3784 unclosed_elements.append('b')
3785 elif k == 'fontStyle' and v == 'italic':
3786 self._out += '<i>'
3787 unclosed_elements.append('i')
3788 elif k == 'textDecoration' and v == 'underline':
3789 self._out += '<u>'
3790 unclosed_elements.append('u')
3791 if font:
3792 self._out += '<font' + font + '>'
3793 unclosed_elements.append('font')
3794 applied_style = {}
3795 if self._applied_styles:
3796 applied_style.update(self._applied_styles[-1])
3797 applied_style.update(style)
3798 self._applied_styles.append(applied_style)
3799 self._unclosed_elements.append(unclosed_elements)
3800
3801 def end(self, tag):
3802 if tag not in (_x('ttml:br'), 'br'):
3803 unclosed_elements = self._unclosed_elements.pop()
3804 for element in reversed(unclosed_elements):
3805 self._out += '</%s>' % element
3806 if unclosed_elements and self._applied_styles:
3807 self._applied_styles.pop()
3808
3809 def data(self, data):
3810 self._out += data
3811
3812 def close(self):
3813 return self._out.strip()
3814
3815 def parse_node(node):
3816 target = TTMLPElementParser()
3817 parser = xml.etree.ElementTree.XMLParser(target=target)
3818 parser.feed(xml.etree.ElementTree.tostring(node))
3819 return parser.close()
3820
3821 for k, v in LEGACY_NAMESPACES:
3822 for ns in v:
3823 dfxp_data = dfxp_data.replace(ns, k)
3824
3825 dfxp = compat_etree_fromstring(dfxp_data)
3826 out = []
3827 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3828
3829 if not paras:
3830 raise ValueError('Invalid dfxp/TTML subtitle')
3831
3832 repeat = False
3833 while True:
3834 for style in dfxp.findall(_x('.//ttml:style')):
3835 style_id = style.get('id') or style.get(_x('xml:id'))
3836 if not style_id:
3837 continue
3838 parent_style_id = style.get('style')
3839 if parent_style_id:
3840 if parent_style_id not in styles:
3841 repeat = True
3842 continue
3843 styles[style_id] = styles[parent_style_id].copy()
3844 for prop in SUPPORTED_STYLING:
3845 prop_val = style.get(_x('tts:' + prop))
3846 if prop_val:
3847 styles.setdefault(style_id, {})[prop] = prop_val
3848 if repeat:
3849 repeat = False
3850 else:
3851 break
3852
3853 for p in ('body', 'div'):
3854 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3855 if ele is None:
3856 continue
3857 style = styles.get(ele.get('style'))
3858 if not style:
3859 continue
3860 default_style.update(style)
3861
3862 for para, index in zip(paras, itertools.count(1)):
3863 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3864 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3865 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3866 if begin_time is None:
3867 continue
3868 if not end_time:
3869 if not dur:
3870 continue
3871 end_time = begin_time + dur
3872 out.append('%d\n%s --> %s\n%s\n\n' % (
3873 index,
3874 srt_subtitles_timecode(begin_time),
3875 srt_subtitles_timecode(end_time),
3876 parse_node(para)))
3877
3878 return ''.join(out)
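
# Minimal usage sketch (hand-written TTML input; the exact SRT output is assumed):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0s" end="1.5s">Hello</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,500\nHello\n\n'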
3879
3880
3881 def cli_option(params, command_option, param, separator=None):
3882 param = params.get(param)
3883 return ([] if param is None
3884 else [command_option, str(param)] if separator is None
3885 else [f'{command_option}{separator}{param}'])
3886
3887
3888 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3889 param = params.get(param)
3890 assert param in (True, False, None)
3891 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3892
3893
3894 def cli_valueless_option(params, command_option, param, expected_value=True):
3895 return [command_option] if params.get(param) == expected_value else []
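
# How these helpers map params to CLI arguments (a sketch; results assumed):
#   >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', '127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'nopart': True}, '--no-part', 'nopart')
#   ['--no-part']
# Note that cli_bool_option reuses cli_option by passing {True: true_value, False: false_value}
# as the "params" dict and the looked-up bool as the key.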
3896
3897
3898 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3899 if isinstance(argdict, (list, tuple)): # for backward compatibility
3900 if use_compat:
3901 return argdict
3902 else:
3903 argdict = None
3904 if argdict is None:
3905 return default
3906 assert isinstance(argdict, dict)
3907
3908 assert isinstance(keys, (list, tuple))
3909 for key_list in keys:
3910 arg_list = list(filter(
3911 lambda x: x is not None,
3912 [argdict.get(key.lower()) for key in variadic(key_list)]))
3913 if arg_list:
3914 return [arg for args in arg_list for arg in args]
3915 return default
3916
3917
3918 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3919 main_key, exe = main_key.lower(), exe.lower()
3920 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3921 keys = [f'{root_key}{k}' for k in (keys or [''])]
3922 if root_key in keys:
3923 if main_key != exe:
3924 keys.append((main_key, exe))
3925 keys.append('default')
3926 else:
3927 use_compat = False
3928 return cli_configuration_args(argdict, keys, default, use_compat)
3929
3930
3931 class ISO639Utils:
3932 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3933 _lang_map = {
3934 'aa': 'aar',
3935 'ab': 'abk',
3936 'ae': 'ave',
3937 'af': 'afr',
3938 'ak': 'aka',
3939 'am': 'amh',
3940 'an': 'arg',
3941 'ar': 'ara',
3942 'as': 'asm',
3943 'av': 'ava',
3944 'ay': 'aym',
3945 'az': 'aze',
3946 'ba': 'bak',
3947 'be': 'bel',
3948 'bg': 'bul',
3949 'bh': 'bih',
3950 'bi': 'bis',
3951 'bm': 'bam',
3952 'bn': 'ben',
3953 'bo': 'bod',
3954 'br': 'bre',
3955 'bs': 'bos',
3956 'ca': 'cat',
3957 'ce': 'che',
3958 'ch': 'cha',
3959 'co': 'cos',
3960 'cr': 'cre',
3961 'cs': 'ces',
3962 'cu': 'chu',
3963 'cv': 'chv',
3964 'cy': 'cym',
3965 'da': 'dan',
3966 'de': 'deu',
3967 'dv': 'div',
3968 'dz': 'dzo',
3969 'ee': 'ewe',
3970 'el': 'ell',
3971 'en': 'eng',
3972 'eo': 'epo',
3973 'es': 'spa',
3974 'et': 'est',
3975 'eu': 'eus',
3976 'fa': 'fas',
3977 'ff': 'ful',
3978 'fi': 'fin',
3979 'fj': 'fij',
3980 'fo': 'fao',
3981 'fr': 'fra',
3982 'fy': 'fry',
3983 'ga': 'gle',
3984 'gd': 'gla',
3985 'gl': 'glg',
3986 'gn': 'grn',
3987 'gu': 'guj',
3988 'gv': 'glv',
3989 'ha': 'hau',
3990 'he': 'heb',
3991 'iw': 'heb', # Replaced by he in 1989 revision
3992 'hi': 'hin',
3993 'ho': 'hmo',
3994 'hr': 'hrv',
3995 'ht': 'hat',
3996 'hu': 'hun',
3997 'hy': 'hye',
3998 'hz': 'her',
3999 'ia': 'ina',
4000 'id': 'ind',
4001 'in': 'ind', # Replaced by id in 1989 revision
4002 'ie': 'ile',
4003 'ig': 'ibo',
4004 'ii': 'iii',
4005 'ik': 'ipk',
4006 'io': 'ido',
4007 'is': 'isl',
4008 'it': 'ita',
4009 'iu': 'iku',
4010 'ja': 'jpn',
4011 'jv': 'jav',
4012 'ka': 'kat',
4013 'kg': 'kon',
4014 'ki': 'kik',
4015 'kj': 'kua',
4016 'kk': 'kaz',
4017 'kl': 'kal',
4018 'km': 'khm',
4019 'kn': 'kan',
4020 'ko': 'kor',
4021 'kr': 'kau',
4022 'ks': 'kas',
4023 'ku': 'kur',
4024 'kv': 'kom',
4025 'kw': 'cor',
4026 'ky': 'kir',
4027 'la': 'lat',
4028 'lb': 'ltz',
4029 'lg': 'lug',
4030 'li': 'lim',
4031 'ln': 'lin',
4032 'lo': 'lao',
4033 'lt': 'lit',
4034 'lu': 'lub',
4035 'lv': 'lav',
4036 'mg': 'mlg',
4037 'mh': 'mah',
4038 'mi': 'mri',
4039 'mk': 'mkd',
4040 'ml': 'mal',
4041 'mn': 'mon',
4042 'mr': 'mar',
4043 'ms': 'msa',
4044 'mt': 'mlt',
4045 'my': 'mya',
4046 'na': 'nau',
4047 'nb': 'nob',
4048 'nd': 'nde',
4049 'ne': 'nep',
4050 'ng': 'ndo',
4051 'nl': 'nld',
4052 'nn': 'nno',
4053 'no': 'nor',
4054 'nr': 'nbl',
4055 'nv': 'nav',
4056 'ny': 'nya',
4057 'oc': 'oci',
4058 'oj': 'oji',
4059 'om': 'orm',
4060 'or': 'ori',
4061 'os': 'oss',
4062 'pa': 'pan',
4063 'pi': 'pli',
4064 'pl': 'pol',
4065 'ps': 'pus',
4066 'pt': 'por',
4067 'qu': 'que',
4068 'rm': 'roh',
4069 'rn': 'run',
4070 'ro': 'ron',
4071 'ru': 'rus',
4072 'rw': 'kin',
4073 'sa': 'san',
4074 'sc': 'srd',
4075 'sd': 'snd',
4076 'se': 'sme',
4077 'sg': 'sag',
4078 'si': 'sin',
4079 'sk': 'slk',
4080 'sl': 'slv',
4081 'sm': 'smo',
4082 'sn': 'sna',
4083 'so': 'som',
4084 'sq': 'sqi',
4085 'sr': 'srp',
4086 'ss': 'ssw',
4087 'st': 'sot',
4088 'su': 'sun',
4089 'sv': 'swe',
4090 'sw': 'swa',
4091 'ta': 'tam',
4092 'te': 'tel',
4093 'tg': 'tgk',
4094 'th': 'tha',
4095 'ti': 'tir',
4096 'tk': 'tuk',
4097 'tl': 'tgl',
4098 'tn': 'tsn',
4099 'to': 'ton',
4100 'tr': 'tur',
4101 'ts': 'tso',
4102 'tt': 'tat',
4103 'tw': 'twi',
4104 'ty': 'tah',
4105 'ug': 'uig',
4106 'uk': 'ukr',
4107 'ur': 'urd',
4108 'uz': 'uzb',
4109 've': 'ven',
4110 'vi': 'vie',
4111 'vo': 'vol',
4112 'wa': 'wln',
4113 'wo': 'wol',
4114 'xh': 'xho',
4115 'yi': 'yid',
4116 'ji': 'yid', # Replaced by yi in 1989 revision
4117 'yo': 'yor',
4118 'za': 'zha',
4119 'zh': 'zho',
4120 'zu': 'zul',
4121 }
4122
4123 @classmethod
4124 def short2long(cls, code):
4125 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4126 return cls._lang_map.get(code[:2])
4127
4128 @classmethod
4129 def long2short(cls, code):
4130 """Convert language code from ISO 639-2/T to ISO 639-1"""
4131 for short_name, long_name in cls._lang_map.items():
4132 if long_name == code:
4133 return short_name
4134
4135
4136 class ISO3166Utils:
4137 # From http://data.okfn.org/data/core/country-list
4138 _country_map = {
4139 'AF': 'Afghanistan',
4140 'AX': 'Åland Islands',
4141 'AL': 'Albania',
4142 'DZ': 'Algeria',
4143 'AS': 'American Samoa',
4144 'AD': 'Andorra',
4145 'AO': 'Angola',
4146 'AI': 'Anguilla',
4147 'AQ': 'Antarctica',
4148 'AG': 'Antigua and Barbuda',
4149 'AR': 'Argentina',
4150 'AM': 'Armenia',
4151 'AW': 'Aruba',
4152 'AU': 'Australia',
4153 'AT': 'Austria',
4154 'AZ': 'Azerbaijan',
4155 'BS': 'Bahamas',
4156 'BH': 'Bahrain',
4157 'BD': 'Bangladesh',
4158 'BB': 'Barbados',
4159 'BY': 'Belarus',
4160 'BE': 'Belgium',
4161 'BZ': 'Belize',
4162 'BJ': 'Benin',
4163 'BM': 'Bermuda',
4164 'BT': 'Bhutan',
4165 'BO': 'Bolivia, Plurinational State of',
4166 'BQ': 'Bonaire, Sint Eustatius and Saba',
4167 'BA': 'Bosnia and Herzegovina',
4168 'BW': 'Botswana',
4169 'BV': 'Bouvet Island',
4170 'BR': 'Brazil',
4171 'IO': 'British Indian Ocean Territory',
4172 'BN': 'Brunei Darussalam',
4173 'BG': 'Bulgaria',
4174 'BF': 'Burkina Faso',
4175 'BI': 'Burundi',
4176 'KH': 'Cambodia',
4177 'CM': 'Cameroon',
4178 'CA': 'Canada',
4179 'CV': 'Cape Verde',
4180 'KY': 'Cayman Islands',
4181 'CF': 'Central African Republic',
4182 'TD': 'Chad',
4183 'CL': 'Chile',
4184 'CN': 'China',
4185 'CX': 'Christmas Island',
4186 'CC': 'Cocos (Keeling) Islands',
4187 'CO': 'Colombia',
4188 'KM': 'Comoros',
4189 'CG': 'Congo',
4190 'CD': 'Congo, the Democratic Republic of the',
4191 'CK': 'Cook Islands',
4192 'CR': 'Costa Rica',
4193 'CI': 'Côte d\'Ivoire',
4194 'HR': 'Croatia',
4195 'CU': 'Cuba',
4196 'CW': 'Curaçao',
4197 'CY': 'Cyprus',
4198 'CZ': 'Czech Republic',
4199 'DK': 'Denmark',
4200 'DJ': 'Djibouti',
4201 'DM': 'Dominica',
4202 'DO': 'Dominican Republic',
4203 'EC': 'Ecuador',
4204 'EG': 'Egypt',
4205 'SV': 'El Salvador',
4206 'GQ': 'Equatorial Guinea',
4207 'ER': 'Eritrea',
4208 'EE': 'Estonia',
4209 'ET': 'Ethiopia',
4210 'FK': 'Falkland Islands (Malvinas)',
4211 'FO': 'Faroe Islands',
4212 'FJ': 'Fiji',
4213 'FI': 'Finland',
4214 'FR': 'France',
4215 'GF': 'French Guiana',
4216 'PF': 'French Polynesia',
4217 'TF': 'French Southern Territories',
4218 'GA': 'Gabon',
4219 'GM': 'Gambia',
4220 'GE': 'Georgia',
4221 'DE': 'Germany',
4222 'GH': 'Ghana',
4223 'GI': 'Gibraltar',
4224 'GR': 'Greece',
4225 'GL': 'Greenland',
4226 'GD': 'Grenada',
4227 'GP': 'Guadeloupe',
4228 'GU': 'Guam',
4229 'GT': 'Guatemala',
4230 'GG': 'Guernsey',
4231 'GN': 'Guinea',
4232 'GW': 'Guinea-Bissau',
4233 'GY': 'Guyana',
4234 'HT': 'Haiti',
4235 'HM': 'Heard Island and McDonald Islands',
4236 'VA': 'Holy See (Vatican City State)',
4237 'HN': 'Honduras',
4238 'HK': 'Hong Kong',
4239 'HU': 'Hungary',
4240 'IS': 'Iceland',
4241 'IN': 'India',
4242 'ID': 'Indonesia',
4243 'IR': 'Iran, Islamic Republic of',
4244 'IQ': 'Iraq',
4245 'IE': 'Ireland',
4246 'IM': 'Isle of Man',
4247 'IL': 'Israel',
4248 'IT': 'Italy',
4249 'JM': 'Jamaica',
4250 'JP': 'Japan',
4251 'JE': 'Jersey',
4252 'JO': 'Jordan',
4253 'KZ': 'Kazakhstan',
4254 'KE': 'Kenya',
4255 'KI': 'Kiribati',
4256 'KP': 'Korea, Democratic People\'s Republic of',
4257 'KR': 'Korea, Republic of',
4258 'KW': 'Kuwait',
4259 'KG': 'Kyrgyzstan',
4260 'LA': 'Lao People\'s Democratic Republic',
4261 'LV': 'Latvia',
4262 'LB': 'Lebanon',
4263 'LS': 'Lesotho',
4264 'LR': 'Liberia',
4265 'LY': 'Libya',
4266 'LI': 'Liechtenstein',
4267 'LT': 'Lithuania',
4268 'LU': 'Luxembourg',
4269 'MO': 'Macao',
4270 'MK': 'Macedonia, the Former Yugoslav Republic of',
4271 'MG': 'Madagascar',
4272 'MW': 'Malawi',
4273 'MY': 'Malaysia',
4274 'MV': 'Maldives',
4275 'ML': 'Mali',
4276 'MT': 'Malta',
4277 'MH': 'Marshall Islands',
4278 'MQ': 'Martinique',
4279 'MR': 'Mauritania',
4280 'MU': 'Mauritius',
4281 'YT': 'Mayotte',
4282 'MX': 'Mexico',
4283 'FM': 'Micronesia, Federated States of',
4284 'MD': 'Moldova, Republic of',
4285 'MC': 'Monaco',
4286 'MN': 'Mongolia',
4287 'ME': 'Montenegro',
4288 'MS': 'Montserrat',
4289 'MA': 'Morocco',
4290 'MZ': 'Mozambique',
4291 'MM': 'Myanmar',
4292 'NA': 'Namibia',
4293 'NR': 'Nauru',
4294 'NP': 'Nepal',
4295 'NL': 'Netherlands',
4296 'NC': 'New Caledonia',
4297 'NZ': 'New Zealand',
4298 'NI': 'Nicaragua',
4299 'NE': 'Niger',
4300 'NG': 'Nigeria',
4301 'NU': 'Niue',
4302 'NF': 'Norfolk Island',
4303 'MP': 'Northern Mariana Islands',
4304 'NO': 'Norway',
4305 'OM': 'Oman',
4306 'PK': 'Pakistan',
4307 'PW': 'Palau',
4308 'PS': 'Palestine, State of',
4309 'PA': 'Panama',
4310 'PG': 'Papua New Guinea',
4311 'PY': 'Paraguay',
4312 'PE': 'Peru',
4313 'PH': 'Philippines',
4314 'PN': 'Pitcairn',
4315 'PL': 'Poland',
4316 'PT': 'Portugal',
4317 'PR': 'Puerto Rico',
4318 'QA': 'Qatar',
4319 'RE': 'Réunion',
4320 'RO': 'Romania',
4321 'RU': 'Russian Federation',
4322 'RW': 'Rwanda',
4323 'BL': 'Saint Barthélemy',
4324 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4325 'KN': 'Saint Kitts and Nevis',
4326 'LC': 'Saint Lucia',
4327 'MF': 'Saint Martin (French part)',
4328 'PM': 'Saint Pierre and Miquelon',
4329 'VC': 'Saint Vincent and the Grenadines',
4330 'WS': 'Samoa',
4331 'SM': 'San Marino',
4332 'ST': 'Sao Tome and Principe',
4333 'SA': 'Saudi Arabia',
4334 'SN': 'Senegal',
4335 'RS': 'Serbia',
4336 'SC': 'Seychelles',
4337 'SL': 'Sierra Leone',
4338 'SG': 'Singapore',
4339 'SX': 'Sint Maarten (Dutch part)',
4340 'SK': 'Slovakia',
4341 'SI': 'Slovenia',
4342 'SB': 'Solomon Islands',
4343 'SO': 'Somalia',
4344 'ZA': 'South Africa',
4345 'GS': 'South Georgia and the South Sandwich Islands',
4346 'SS': 'South Sudan',
4347 'ES': 'Spain',
4348 'LK': 'Sri Lanka',
4349 'SD': 'Sudan',
4350 'SR': 'Suriname',
4351 'SJ': 'Svalbard and Jan Mayen',
4352 'SZ': 'Swaziland',
4353 'SE': 'Sweden',
4354 'CH': 'Switzerland',
4355 'SY': 'Syrian Arab Republic',
4356 'TW': 'Taiwan, Province of China',
4357 'TJ': 'Tajikistan',
4358 'TZ': 'Tanzania, United Republic of',
4359 'TH': 'Thailand',
4360 'TL': 'Timor-Leste',
4361 'TG': 'Togo',
4362 'TK': 'Tokelau',
4363 'TO': 'Tonga',
4364 'TT': 'Trinidad and Tobago',
4365 'TN': 'Tunisia',
4366 'TR': 'Turkey',
4367 'TM': 'Turkmenistan',
4368 'TC': 'Turks and Caicos Islands',
4369 'TV': 'Tuvalu',
4370 'UG': 'Uganda',
4371 'UA': 'Ukraine',
4372 'AE': 'United Arab Emirates',
4373 'GB': 'United Kingdom',
4374 'US': 'United States',
4375 'UM': 'United States Minor Outlying Islands',
4376 'UY': 'Uruguay',
4377 'UZ': 'Uzbekistan',
4378 'VU': 'Vanuatu',
4379 'VE': 'Venezuela, Bolivarian Republic of',
4380 'VN': 'Viet Nam',
4381 'VG': 'Virgin Islands, British',
4382 'VI': 'Virgin Islands, U.S.',
4383 'WF': 'Wallis and Futuna',
4384 'EH': 'Western Sahara',
4385 'YE': 'Yemen',
4386 'ZM': 'Zambia',
4387 'ZW': 'Zimbabwe',
4388 # Not ISO 3166 codes, but used for IP blocks
4389 'AP': 'Asia/Pacific Region',
4390 'EU': 'Europe',
4391 }
4392
4393 @classmethod
4394 def short2full(cls, code):
4395 """Convert an ISO 3166-2 country code to the corresponding full name"""
4396 return cls._country_map.get(code.upper())
4397
4398
4399 class GeoUtils:
4400 # Major IPv4 address blocks per country
4401 _country_ip_map = {
4402 'AD': '46.172.224.0/19',
4403 'AE': '94.200.0.0/13',
4404 'AF': '149.54.0.0/17',
4405 'AG': '209.59.64.0/18',
4406 'AI': '204.14.248.0/21',
4407 'AL': '46.99.0.0/16',
4408 'AM': '46.70.0.0/15',
4409 'AO': '105.168.0.0/13',
4410 'AP': '182.50.184.0/21',
4411 'AQ': '23.154.160.0/24',
4412 'AR': '181.0.0.0/12',
4413 'AS': '202.70.112.0/20',
4414 'AT': '77.116.0.0/14',
4415 'AU': '1.128.0.0/11',
4416 'AW': '181.41.0.0/18',
4417 'AX': '185.217.4.0/22',
4418 'AZ': '5.197.0.0/16',
4419 'BA': '31.176.128.0/17',
4420 'BB': '65.48.128.0/17',
4421 'BD': '114.130.0.0/16',
4422 'BE': '57.0.0.0/8',
4423 'BF': '102.178.0.0/15',
4424 'BG': '95.42.0.0/15',
4425 'BH': '37.131.0.0/17',
4426 'BI': '154.117.192.0/18',
4427 'BJ': '137.255.0.0/16',
4428 'BL': '185.212.72.0/23',
4429 'BM': '196.12.64.0/18',
4430 'BN': '156.31.0.0/16',
4431 'BO': '161.56.0.0/16',
4432 'BQ': '161.0.80.0/20',
4433 'BR': '191.128.0.0/12',
4434 'BS': '24.51.64.0/18',
4435 'BT': '119.2.96.0/19',
4436 'BW': '168.167.0.0/16',
4437 'BY': '178.120.0.0/13',
4438 'BZ': '179.42.192.0/18',
4439 'CA': '99.224.0.0/11',
4440 'CD': '41.243.0.0/16',
4441 'CF': '197.242.176.0/21',
4442 'CG': '160.113.0.0/16',
4443 'CH': '85.0.0.0/13',
4444 'CI': '102.136.0.0/14',
4445 'CK': '202.65.32.0/19',
4446 'CL': '152.172.0.0/14',
4447 'CM': '102.244.0.0/14',
4448 'CN': '36.128.0.0/10',
4449 'CO': '181.240.0.0/12',
4450 'CR': '201.192.0.0/12',
4451 'CU': '152.206.0.0/15',
4452 'CV': '165.90.96.0/19',
4453 'CW': '190.88.128.0/17',
4454 'CY': '31.153.0.0/16',
4455 'CZ': '88.100.0.0/14',
4456 'DE': '53.0.0.0/8',
4457 'DJ': '197.241.0.0/17',
4458 'DK': '87.48.0.0/12',
4459 'DM': '192.243.48.0/20',
4460 'DO': '152.166.0.0/15',
4461 'DZ': '41.96.0.0/12',
4462 'EC': '186.68.0.0/15',
4463 'EE': '90.190.0.0/15',
4464 'EG': '156.160.0.0/11',
4465 'ER': '196.200.96.0/20',
4466 'ES': '88.0.0.0/11',
4467 'ET': '196.188.0.0/14',
4468 'EU': '2.16.0.0/13',
4469 'FI': '91.152.0.0/13',
4470 'FJ': '144.120.0.0/16',
4471 'FK': '80.73.208.0/21',
4472 'FM': '119.252.112.0/20',
4473 'FO': '88.85.32.0/19',
4474 'FR': '90.0.0.0/9',
4475 'GA': '41.158.0.0/15',
4476 'GB': '25.0.0.0/8',
4477 'GD': '74.122.88.0/21',
4478 'GE': '31.146.0.0/16',
4479 'GF': '161.22.64.0/18',
4480 'GG': '62.68.160.0/19',
4481 'GH': '154.160.0.0/12',
4482 'GI': '95.164.0.0/16',
4483 'GL': '88.83.0.0/19',
4484 'GM': '160.182.0.0/15',
4485 'GN': '197.149.192.0/18',
4486 'GP': '104.250.0.0/19',
4487 'GQ': '105.235.224.0/20',
4488 'GR': '94.64.0.0/13',
4489 'GT': '168.234.0.0/16',
4490 'GU': '168.123.0.0/16',
4491 'GW': '197.214.80.0/20',
4492 'GY': '181.41.64.0/18',
4493 'HK': '113.252.0.0/14',
4494 'HN': '181.210.0.0/16',
4495 'HR': '93.136.0.0/13',
4496 'HT': '148.102.128.0/17',
4497 'HU': '84.0.0.0/14',
4498 'ID': '39.192.0.0/10',
4499 'IE': '87.32.0.0/12',
4500 'IL': '79.176.0.0/13',
4501 'IM': '5.62.80.0/20',
4502 'IN': '117.192.0.0/10',
4503 'IO': '203.83.48.0/21',
4504 'IQ': '37.236.0.0/14',
4505 'IR': '2.176.0.0/12',
4506 'IS': '82.221.0.0/16',
4507 'IT': '79.0.0.0/10',
4508 'JE': '87.244.64.0/18',
4509 'JM': '72.27.0.0/17',
4510 'JO': '176.29.0.0/16',
4511 'JP': '133.0.0.0/8',
4512 'KE': '105.48.0.0/12',
4513 'KG': '158.181.128.0/17',
4514 'KH': '36.37.128.0/17',
4515 'KI': '103.25.140.0/22',
4516 'KM': '197.255.224.0/20',
4517 'KN': '198.167.192.0/19',
4518 'KP': '175.45.176.0/22',
4519 'KR': '175.192.0.0/10',
4520 'KW': '37.36.0.0/14',
4521 'KY': '64.96.0.0/15',
4522 'KZ': '2.72.0.0/13',
4523 'LA': '115.84.64.0/18',
4524 'LB': '178.135.0.0/16',
4525 'LC': '24.92.144.0/20',
4526 'LI': '82.117.0.0/19',
4527 'LK': '112.134.0.0/15',
4528 'LR': '102.183.0.0/16',
4529 'LS': '129.232.0.0/17',
4530 'LT': '78.56.0.0/13',
4531 'LU': '188.42.0.0/16',
4532 'LV': '46.109.0.0/16',
4533 'LY': '41.252.0.0/14',
4534 'MA': '105.128.0.0/11',
4535 'MC': '88.209.64.0/18',
4536 'MD': '37.246.0.0/16',
4537 'ME': '178.175.0.0/17',
4538 'MF': '74.112.232.0/21',
4539 'MG': '154.126.0.0/17',
4540 'MH': '117.103.88.0/21',
4541 'MK': '77.28.0.0/15',
4542 'ML': '154.118.128.0/18',
4543 'MM': '37.111.0.0/17',
4544 'MN': '49.0.128.0/17',
4545 'MO': '60.246.0.0/16',
4546 'MP': '202.88.64.0/20',
4547 'MQ': '109.203.224.0/19',
4548 'MR': '41.188.64.0/18',
4549 'MS': '208.90.112.0/22',
4550 'MT': '46.11.0.0/16',
4551 'MU': '105.16.0.0/12',
4552 'MV': '27.114.128.0/18',
4553 'MW': '102.70.0.0/15',
4554 'MX': '187.192.0.0/11',
4555 'MY': '175.136.0.0/13',
4556 'MZ': '197.218.0.0/15',
4557 'NA': '41.182.0.0/16',
4558 'NC': '101.101.0.0/18',
4559 'NE': '197.214.0.0/18',
4560 'NF': '203.17.240.0/22',
4561 'NG': '105.112.0.0/12',
4562 'NI': '186.76.0.0/15',
4563 'NL': '145.96.0.0/11',
4564 'NO': '84.208.0.0/13',
4565 'NP': '36.252.0.0/15',
4566 'NR': '203.98.224.0/19',
4567 'NU': '49.156.48.0/22',
4568 'NZ': '49.224.0.0/14',
4569 'OM': '5.36.0.0/15',
4570 'PA': '186.72.0.0/15',
4571 'PE': '186.160.0.0/14',
4572 'PF': '123.50.64.0/18',
4573 'PG': '124.240.192.0/19',
4574 'PH': '49.144.0.0/13',
4575 'PK': '39.32.0.0/11',
4576 'PL': '83.0.0.0/11',
4577 'PM': '70.36.0.0/20',
4578 'PR': '66.50.0.0/16',
4579 'PS': '188.161.0.0/16',
4580 'PT': '85.240.0.0/13',
4581 'PW': '202.124.224.0/20',
4582 'PY': '181.120.0.0/14',
4583 'QA': '37.210.0.0/15',
4584 'RE': '102.35.0.0/16',
4585 'RO': '79.112.0.0/13',
4586 'RS': '93.86.0.0/15',
4587 'RU': '5.136.0.0/13',
4588 'RW': '41.186.0.0/16',
4589 'SA': '188.48.0.0/13',
4590 'SB': '202.1.160.0/19',
4591 'SC': '154.192.0.0/11',
4592 'SD': '102.120.0.0/13',
4593 'SE': '78.64.0.0/12',
4594 'SG': '8.128.0.0/10',
4595 'SI': '188.196.0.0/14',
4596 'SK': '78.98.0.0/15',
4597 'SL': '102.143.0.0/17',
4598 'SM': '89.186.32.0/19',
4599 'SN': '41.82.0.0/15',
4600 'SO': '154.115.192.0/18',
4601 'SR': '186.179.128.0/17',
4602 'SS': '105.235.208.0/21',
4603 'ST': '197.159.160.0/19',
4604 'SV': '168.243.0.0/16',
4605 'SX': '190.102.0.0/20',
4606 'SY': '5.0.0.0/16',
4607 'SZ': '41.84.224.0/19',
4608 'TC': '65.255.48.0/20',
4609 'TD': '154.68.128.0/19',
4610 'TG': '196.168.0.0/14',
4611 'TH': '171.96.0.0/13',
4612 'TJ': '85.9.128.0/18',
4613 'TK': '27.96.24.0/21',
4614 'TL': '180.189.160.0/20',
4615 'TM': '95.85.96.0/19',
4616 'TN': '197.0.0.0/11',
4617 'TO': '175.176.144.0/21',
4618 'TR': '78.160.0.0/11',
4619 'TT': '186.44.0.0/15',
4620 'TV': '202.2.96.0/19',
4621 'TW': '120.96.0.0/11',
4622 'TZ': '156.156.0.0/14',
4623 'UA': '37.52.0.0/14',
4624 'UG': '102.80.0.0/13',
4625 'US': '6.0.0.0/8',
4626 'UY': '167.56.0.0/13',
4627 'UZ': '84.54.64.0/18',
4628 'VA': '212.77.0.0/19',
4629 'VC': '207.191.240.0/21',
4630 'VE': '186.88.0.0/13',
4631 'VG': '66.81.192.0/20',
4632 'VI': '146.226.0.0/16',
4633 'VN': '14.160.0.0/11',
4634 'VU': '202.80.32.0/20',
4635 'WF': '117.20.32.0/21',
4636 'WS': '202.4.32.0/19',
4637 'YE': '134.35.0.0/16',
4638 'YT': '41.242.116.0/22',
4639 'ZA': '41.0.0.0/11',
4640 'ZM': '102.144.0.0/13',
4641 'ZW': '102.177.192.0/18',
4642 }
4643
4644 @classmethod
4645 def random_ipv4(cls, code_or_block):
4646 if len(code_or_block) == 2:
4647 block = cls._country_ip_map.get(code_or_block.upper())
4648 if not block:
4649 return None
4650 else:
4651 block = code_or_block
4652 addr, preflen = block.split('/')
4653 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4654 addr_max = addr_min | (0xffffffff >> int(preflen))
4655 return str(socket.inet_ntoa(
4656 struct.pack('!L', random.randint(addr_min, addr_max))))
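
# CIDR sketch (the returned address is random; the values shown are made up):
#   >>> GeoUtils.random_ipv4('US')            # some address inside 6.0.0.0/8
#   '6.123.45.67'
#   >>> GeoUtils.random_ipv4('192.0.2.0/24')  # an explicit block also works
#   '192.0.2.200'
# addr_min is the network address; addr_max ORs in all host bits (0xffffffff >> prefix length).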
4657
4658
4659 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4660 def __init__(self, proxies=None):
4661 # Set default handlers
4662 for type in ('http', 'https'):
4663 setattr(self, '%s_open' % type,
4664 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4665 meth(r, proxy, type))
4666 urllib.request.ProxyHandler.__init__(self, proxies)
4667
4668 def proxy_open(self, req, proxy, type):
4669 req_proxy = req.headers.get('Ytdl-request-proxy')
4670 if req_proxy is not None:
4671 proxy = req_proxy
4672 del req.headers['Ytdl-request-proxy']
4673
4674 if proxy == '__noproxy__':
4675 return None # No Proxy
4676 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4677 req.add_header('Ytdl-socks-proxy', proxy)
4678 # yt-dlp's http/https handlers handle wrapping the socket with SOCKS
4679 return None
4680 return urllib.request.ProxyHandler.proxy_open(
4681 self, req, proxy, type)
4682
4683
4684 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4685 # released into the public domain
4686 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4687
4688 def long_to_bytes(n, blocksize=0):
4689 """long_to_bytes(n:long, blocksize:int) : string
4690 Convert a long integer to a byte string.
4691
4692 If optional blocksize is given and greater than zero, pad the front of the
4693 byte string with binary zeros so that the length is a multiple of
4694 blocksize.
4695 """
4696 # after much testing, this algorithm was deemed to be the fastest
4697 s = b''
4698 n = int(n)
4699 while n > 0:
4700 s = struct.pack('>I', n & 0xffffffff) + s
4701 n = n >> 32
4702 # strip off leading zeros
4703 for i in range(len(s)):
4704 if s[i] != b'\000'[0]:
4705 break
4706 else:
4707 # only happens when n == 0
4708 s = b'\000'
4709 i = 0
4710 s = s[i:]
4711 # add back some pad bytes. this could be done more efficiently w.r.t. the
4712 # de-padding being done above, but sigh...
4713 if blocksize > 0 and len(s) % blocksize:
4714 s = (blocksize - len(s) % blocksize) * b'\000' + s
4715 return s
4716
4717
4718 def bytes_to_long(s):
4719 """bytes_to_long(string) : long
4720 Convert a byte string to a long integer.
4721
4722 This is (essentially) the inverse of long_to_bytes().
4723 """
4724 acc = 0
4725 length = len(s)
4726 if length % 4:
4727 extra = (4 - length % 4)
4728 s = b'\000' * extra + s
4729 length = length + extra
4730 for i in range(0, length, 4):
4731 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4732 return acc
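
# Round-trip sketch (expected values assumed):
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> long_to_bytes(1, blocksize=4)   # zero-padded to a multiple of blocksize
#   b'\x00\x00\x00\x01'
#   >>> bytes_to_long(b'\x01\x00')
#   256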
4733
4734
4735 def ohdave_rsa_encrypt(data, exponent, modulus):
4736 '''
4737 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4738
4739 Input:
4740 data: data to encrypt, bytes-like object
4741 exponent, modulus: parameter e and N of RSA algorithm, both integer
4742 Output: hex string of encrypted data
4743
4744 Limitation: supports one block encryption only
4745 '''
4746
4747 payload = int(binascii.hexlify(data[::-1]), 16)
4748 encrypted = pow(payload, exponent, modulus)
4749 return '%x' % encrypted
4750
4751
4752 def pkcs1pad(data, length):
4753 """
4754 Padding input data with PKCS#1 scheme
4755
4756 @param {int[]} data input data
4757 @param {int} length target length
4758 @returns {int[]} padded data
4759 """
4760 if len(data) > length - 11:
4761 raise ValueError('Input data too long for PKCS#1 padding')
4762
4763 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4764 return [0, 2] + pseudo_random + [0] + data
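
# Layout sketch: [0x00, 0x02, <random padding>, 0x00, *data], always `length` items long:
#   >>> padded = pkcs1pad(list(b'msg'), 64)
#   >>> len(padded), padded[:2], padded[-4:]
#   (64, [0, 2], [0, 109, 115, 103])
# Note: strict PKCS#1 v1.5 type-2 padding requires the random octets to be nonzero,
# whereas randint(0, 254) above may occasionally produce zeros.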
4765
4766
4767 def _base_n_table(n, table):
4768 if not table and not n:
4769 raise ValueError('Either table or n must be specified')
4770 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4771
4772 if n and n != len(table):
4773 raise ValueError(f'base {n} exceeds table length {len(table)}')
4774 return table
4775
4776
4777 def encode_base_n(num, n=None, table=None):
4778 """Convert given int to a base-n string"""
4779 table = _base_n_table(n, table)
4780 if not num:
4781 return table[0]
4782
4783 result, base = '', len(table)
4784 while num:
4785 result = table[num % base] + result
4786 num = num // base
4787 return result
4788
4789
4790 def decode_base_n(string, n=None, table=None):
4791 """Convert given base-n string to int"""
4792 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4793 result, base = 0, len(table)
4794 for char in string:
4795 result = result * base + table[char]
4796 return result
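
# Base-conversion sketch (expected values assumed):
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(5, table='01')   # a custom alphabet implies the base
#   '101'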
4797
4798
4799 def decode_base(value, digits):
4800 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4801 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4802 return decode_base_n(value, table=digits)
4803
4804
4805 def decode_packed_codes(code):
4806 mobj = re.search(PACKED_CODES_RE, code)
4807 obfuscated_code, base, count, symbols = mobj.groups()
4808 base = int(base)
4809 count = int(count)
4810 symbols = symbols.split('|')
4811 symbol_table = {}
4812
4813 while count:
4814 count -= 1
4815 base_n_count = encode_base_n(count, base)
4816 symbol_table[base_n_count] = symbols[count] or base_n_count
4817
4818 return re.sub(
4819 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4820 obfuscated_code)
4821
4822
4823 def caesar(s, alphabet, shift):
4824 if shift == 0:
4825 return s
4826 l = len(alphabet)
4827 return ''.join(
4828 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4829 for c in s)
4830
4831
4832 def rot47(s):
4833 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
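
# Substitution-cipher sketch (expected values assumed):
#   >>> caesar('ab', 'abc', 1)        # shift within the given alphabet; other chars pass through
#   'bc'
#   >>> rot47(rot47('secret'))        # rot47 is its own inverse (2 * 47 == 94, the alphabet size)
#   'secret'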
4834
4835
4836 def parse_m3u8_attributes(attrib):
4837 info = {}
4838 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4839 if val.startswith('"'):
4840 val = val[1:-1]
4841 info[key] = val
4842 return info
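
# Attribute-list sketch (sample resembling an EXT-X-STREAM-INF line; output assumed):
#   >>> parse_m3u8_attributes('BANDWIDTH=800000,CODECS="avc1.4d401e,mp4a.40.2"')
#   {'BANDWIDTH': '800000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}
# Quoted values may contain commas; unquoted values end at the next comma.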
4843
4844
4845 def urshift(val, n):
4846 return val >> n if val >= 0 else (val + 0x100000000) >> n
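
# Emulates an unsigned 32-bit right shift (JavaScript's '>>>'); sketch, value assumed:
#   >>> urshift(-1, 28)
#   15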
4847
4848
4849 # Based on png2str() written by @gdkchan and improved by @yokrysty
4850 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4851 def decode_png(png_data):
4852 # Reference: https://www.w3.org/TR/PNG/
4853 header = png_data[8:]
4854
4855 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4856 raise OSError('Not a valid PNG file.')
4857
4858 int_map = {1: '>B', 2: '>H', 4: '>I'}
4859 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4860
4861 chunks = []
4862
4863 while header:
4864 length = unpack_integer(header[:4])
4865 header = header[4:]
4866
4867 chunk_type = header[:4]
4868 header = header[4:]
4869
4870 chunk_data = header[:length]
4871 header = header[length:]
4872
4873 header = header[4:] # Skip CRC
4874
4875 chunks.append({
4876 'type': chunk_type,
4877 'length': length,
4878 'data': chunk_data
4879 })
4880
4881 ihdr = chunks[0]['data']
4882
4883 width = unpack_integer(ihdr[:4])
4884 height = unpack_integer(ihdr[4:8])
4885
4886 idat = b''
4887
4888 for chunk in chunks:
4889 if chunk['type'] == b'IDAT':
4890 idat += chunk['data']
4891
4892 if not idat:
4893 raise OSError('Unable to read PNG data.')
4894
4895 decompressed_data = bytearray(zlib.decompress(idat))
4896
4897 stride = width * 3
4898 pixels = []
4899
4900 def _get_pixel(idx):
4901 x = idx % stride
4902 y = idx // stride
4903 return pixels[y][x]
4904
4905 for y in range(height):
4906 basePos = y * (1 + stride)
4907 filter_type = decompressed_data[basePos]
4908
4909 current_row = []
4910
4911 pixels.append(current_row)
4912
4913 for x in range(stride):
4914 color = decompressed_data[1 + basePos + x]
4915 basex = y * stride + x
4916 left = 0
4917 up = 0
4918
4919 if x > 2:
4920 left = _get_pixel(basex - 3)
4921 if y > 0:
4922 up = _get_pixel(basex - stride)
4923
4924 if filter_type == 1: # Sub
4925 color = (color + left) & 0xff
4926 elif filter_type == 2: # Up
4927 color = (color + up) & 0xff
4928 elif filter_type == 3: # Average
4929 color = (color + ((left + up) >> 1)) & 0xff
4930 elif filter_type == 4: # Paeth
4931 a = left
4932 b = up
4933 c = 0
4934
4935 if x > 2 and y > 0:
4936 c = _get_pixel(basex - stride - 3)
4937
4938 p = a + b - c
4939
4940 pa = abs(p - a)
4941 pb = abs(p - b)
4942 pc = abs(p - c)
4943
4944 if pa <= pb and pa <= pc:
4945 color = (color + a) & 0xff
4946 elif pb <= pc:
4947 color = (color + b) & 0xff
4948 else:
4949 color = (color + c) & 0xff
4950
4951 current_row.append(color)
4952
4953 return width, height, pixels
4954
4955
4956 def write_xattr(path, key, value):
4957 # Windows: Write xattrs to NTFS Alternate Data Streams:
4958 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4959 if compat_os_name == 'nt':
4960 assert ':' not in key
4961 assert os.path.exists(path)
4962
4963 try:
4964 with open(f'{path}:{key}', 'wb') as f:
4965 f.write(value)
4966 except OSError as e:
4967 raise XAttrMetadataError(e.errno, e.strerror)
4968 return
4969
4970 # UNIX Method 1. Use xattrs/pyxattrs modules
4971
4972 setxattr = None
4973 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4974 # Unicode arguments are not supported in pyxattr until version 0.5.0
4975 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4976 if version_tuple(xattr.__version__) >= (0, 5, 0):
4977 setxattr = xattr.set
4978 elif xattr:
4979 setxattr = xattr.setxattr
4980
4981 if setxattr:
4982 try:
4983 setxattr(path, key, value)
4984 except OSError as e:
4985 raise XAttrMetadataError(e.errno, e.strerror)
4986 return
4987
4988 # UNIX Method 2. Use setfattr/xattr executables
4989 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4990 else 'xattr' if check_executable('xattr', ['-h']) else None)
4991 if not exe:
4992 raise XAttrUnavailableError(
4993 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4994 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4995
4996 value = value.decode()
4997 try:
4998 _, stderr, returncode = Popen.run(
4999 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5000 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5001 except OSError as e:
5002 raise XAttrMetadataError(e.errno, e.strerror)
5003 if returncode:
5004 raise XAttrMetadataError(returncode, stderr)
5005
5006
5007 def random_birthday(year_field, month_field, day_field):
5008 start_date = datetime.date(1950, 1, 1)
5009 end_date = datetime.date(1995, 12, 31)
5010 offset = random.randint(0, (end_date - start_date).days)
5011 random_date = start_date + datetime.timedelta(offset)
5012 return {
5013 year_field: str(random_date.year),
5014 month_field: str(random_date.month),
5015 day_field: str(random_date.day),
5016 }
5017
5018
5019 # Templates for internet shortcut files, which are plain text files.
5020 DOT_URL_LINK_TEMPLATE = '''\
5021 [InternetShortcut]
5022 URL=%(url)s
5023 '''
5024
5025 DOT_WEBLOC_LINK_TEMPLATE = '''\
5026 <?xml version="1.0" encoding="UTF-8"?>
5027 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5028 <plist version="1.0">
5029 <dict>
5030 \t<key>URL</key>
5031 \t<string>%(url)s</string>
5032 </dict>
5033 </plist>
5034 '''
5035
5036 DOT_DESKTOP_LINK_TEMPLATE = '''\
5037 [Desktop Entry]
5038 Encoding=UTF-8
5039 Name=%(filename)s
5040 Type=Link
5041 URL=%(url)s
5042 Icon=text-html
5043 '''
5044
5045 LINK_TEMPLATES = {
5046 'url': DOT_URL_LINK_TEMPLATE,
5047 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5048 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5049 }
5050
5051
5052 def iri_to_uri(iri):
5053 """
5054 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5055
5056 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5057 """
5058
5059 iri_parts = urllib.parse.urlparse(iri)
5060
5061 if '[' in iri_parts.netloc:
5062 raise ValueError('IPv6 URIs are not yet supported.')
5063 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5064
5065 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5066
5067 net_location = ''
5068 if iri_parts.username:
5069 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5070 if iri_parts.password is not None:
5071 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5072 net_location += '@'
5073
5074 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5075 # The 'idna' encoding produces ASCII text.
5076 if iri_parts.port is not None and iri_parts.port != 80:
5077 net_location += ':' + str(iri_parts.port)
5078
5079 return urllib.parse.urlunparse(
5080 (iri_parts.scheme,
5081 net_location,
5082
5083 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5084
5085 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5086 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5087
5088 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5089 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5090
5091 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5092
5093 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
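
# Conversion sketch (hypothetical IRI; the percent-encoded output is assumed):
#   >>> iri_to_uri('http://example.com/路径?q=值')
#   'http://example.com/%E8%B7%AF%E5%BE%84?q=%E5%80%BC'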
5094
5095
5096 def to_high_limit_path(path):
5097 if sys.platform in ['win32', 'cygwin']:
5098 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5099 return '\\\\?\\' + os.path.abspath(path)
5100
5101 return path
5102
5103
5104 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5105 val = traverse_obj(obj, *variadic(field))
5106 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5107 return default
5108 return template % func(val)
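
# Formatting sketch (expected values assumed):
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({}, 'height', '%sp', default='unknown')
#   'unknown'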
5109
5110
5111 def clean_podcast_url(url):
5112 return re.sub(r'''(?x)
5113 (?:
5114 (?:
5115 chtbl\.com/track|
5116 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5117 play\.podtrac\.com
5118 )/[^/]+|
5119 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5120 flex\.acast\.com|
5121 pd(?:
5122 cn\.co| # https://podcorn.com/analytics-prefix/
5123 st\.fm # https://podsights.com/docs/
5124 )/e
5125 )/''', '', url)
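
# Prefix-stripping sketch (hypothetical URL; the host list comes from the regex above):
#   >>> clean_podcast_url('https://chtbl.com/track/12345/media.example.com/episode.mp3')
#   'https://media.example.com/episode.mp3'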
5126
5127
5128 _HEX_TABLE = '0123456789abcdef'
5129
5130
5131 def random_uuidv4():
5132 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5133
5134
5135 def make_dir(path, to_screen=None):
5136 try:
5137 dn = os.path.dirname(path)
5138 if dn and not os.path.exists(dn):
5139 os.makedirs(dn)
5140 return True
5141 except OSError as err:
5142 if callable(to_screen):
5143 to_screen('unable to create directory ' + error_to_compat_str(err))
5144 return False
5145
5146
5147 def get_executable_path():
5148 from .update import _get_variant_and_executable_path
5149
5150 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5151
5152
5153 def load_plugins(name, suffix, namespace):
5154 classes = {}
5155 with contextlib.suppress(FileNotFoundError):
5156 plugins_spec = importlib.util.spec_from_file_location(
5157 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5158 plugins = importlib.util.module_from_spec(plugins_spec)
5159 sys.modules[plugins_spec.name] = plugins
5160 plugins_spec.loader.exec_module(plugins)
5161 for name in dir(plugins):
5162 if name in namespace:
5163 continue
5164 if not name.endswith(suffix):
5165 continue
5166 klass = getattr(plugins, name)
5167 classes[name] = namespace[name] = klass
5168 return classes
5169
5170
5171 def traverse_obj(
5172 obj, *path_list, default=None, expected_type=None, get_all=True,
5173 casesense=True, is_user_input=False, traverse_string=False):
5174 ''' Traverse nested list/dict/tuple
5175 @param path_list A list of paths which are checked one by one.
5176 Each path is a list of keys where each key is a:
5177 - None: Do nothing
5178 - string: A dictionary key
5179 - int: An index into a list
5180 - tuple: A list of keys all of which will be traversed
5181 - Ellipsis: Fetch all values in the object
5182 - Function: Takes the key and value as arguments
5183 and returns whether the key matches or not
5184 @param default Default value to return
5185 @param expected_type Only accept final value of this type (Can also be any callable)
5186 @param get_all Return all the values obtained from a path or only the first one
5187 @param casesense Whether to consider dictionary keys as case sensitive
5188 @param is_user_input Whether the keys are generated from user input. If True,
5189 strings are converted to int/slice if necessary
5190 @param traverse_string Whether to traverse inside strings. If True, any
5191 non-compatible object will also be converted into a string
5192 # TODO: Write tests
5193 '''
5194 if not casesense:
5195 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5196 path_list = (map(_lower, variadic(path)) for path in path_list)
5197
5198 def _traverse_obj(obj, path, _current_depth=0):
5199 nonlocal depth
5200 path = tuple(variadic(path))
5201 for i, key in enumerate(path):
5202 if None in (key, obj):
5203 return obj
5204 if isinstance(key, (list, tuple)):
5205 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5206 key = ...
5207 if key is ...:
5208 obj = (obj.values() if isinstance(obj, dict)
5209 else obj if isinstance(obj, (list, tuple, LazyList))
5210 else str(obj) if traverse_string else [])
5211 _current_depth += 1
5212 depth = max(depth, _current_depth)
5213 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5214 elif callable(key):
5215 if isinstance(obj, (list, tuple, LazyList)):
5216 obj = enumerate(obj)
5217 elif isinstance(obj, dict):
5218 obj = obj.items()
5219 else:
5220 if not traverse_string:
5221 return None
5222 obj = str(obj)
5223 _current_depth += 1
5224 depth = max(depth, _current_depth)
5225 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5226 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5227 obj = (obj.get(key) if casesense or (key in obj)
5228 else next((v for k, v in obj.items() if _lower(k) == key), None))
5229 else:
5230 if is_user_input:
5231 key = (int_or_none(key) if ':' not in key
5232 else slice(*map(int_or_none, key.split(':'))))
5233 if key == slice(None):
5234 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5235 if not isinstance(key, (int, slice)):
5236 return None
5237 if not isinstance(obj, (list, tuple, LazyList)):
5238 if not traverse_string:
5239 return None
5240 obj = str(obj)
5241 try:
5242 obj = obj[key]
5243 except IndexError:
5244 return None
5245 return obj
5246
5247 if isinstance(expected_type, type):
5248 type_test = lambda val: val if isinstance(val, expected_type) else None
5249 else:
5250 type_test = expected_type or IDENTITY
5251
5252 for path in path_list:
5253 depth = 0
5254 val = _traverse_obj(obj, path)
5255 if val is not None:
5256 if depth:
5257 for _ in range(depth - 1):
5258 val = itertools.chain.from_iterable(v for v in val if v is not None)
5259 val = [v for v in map(type_test, val) if v is not None]
5260 if val:
5261 return val if get_all else val[0]
5262 else:
5263 val = type_test(val)
5264 if val is not None:
5265 return val
5266 return default
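
# Path mini-language sketch (expected values assumed; see the docstring above):
#   >>> d = {'formats': [{'url': 'u1'}, {'height': 720, 'url': 'u2'}]}
#   >>> traverse_obj(d, ('formats', 0, 'url'))
#   'u1'
#   >>> traverse_obj(d, ('formats', ..., 'url'))            # Ellipsis branches over every item
#   ['u1', 'u2']
#   >>> traverse_obj(d, ('formats', ..., 'height'), get_all=False)
#   720
#   >>> traverse_obj(d, 'missing', default='fallback')
#   'fallback'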
5267
5268
5269 def traverse_dict(dictn, keys, casesense=True):
5270 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5271 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5272 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5273
5274
5275 def get_first(obj, keys, **kwargs):
5276 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5277
5278
5279 def variadic(x, allowed_types=(str, bytes, dict)):
5280 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5281
5282
5283 def time_seconds(**kwargs):
5284 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5285 return t.timestamp()
5286
5287
5288 # create a JSON Web Signature (jws) with HS256 algorithm
5289 # the resulting format is in JWS Compact Serialization
5290 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5291 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5292 def jwt_encode_hs256(payload_data, key, headers={}):
5293 header_data = {
5294 'alg': 'HS256',
5295 'typ': 'JWT',
5296 }
5297 if headers:
5298 header_data.update(headers)
5299 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5300 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5301 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5302 signature_b64 = base64.b64encode(h.digest())
5303 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5304 return token
5305
5306
5307 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5308 def jwt_decode_hs256(jwt):
5309 header_b64, payload_b64, signature_b64 = jwt.split('.')
5310 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5311 return payload_data
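
# Round-trip sketch (key and payload are made up):
#   >>> token = jwt_encode_hs256({'id': 123}, 'secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'id': 123}
# Note: the encoder uses plain (not URL-safe) base64 and keeps the '=' padding, so
# strict RFC 7515 verifiers may reject its tokens; the decoder does not verify the
# signature and assumes the padding is intact.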
5312
5313
5314 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5315
5316
5317 @functools.cache
5318 def supports_terminal_sequences(stream):
5319 if compat_os_name == 'nt':
5320 if not WINDOWS_VT_MODE:
5321 return False
5322 elif not os.getenv('TERM'):
5323 return False
5324 try:
5325 return stream.isatty()
5326 except BaseException:
5327 return False
5328
5329
5330 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5331 if get_windows_version() < (10, 0, 10586):
5332 return
5333 global WINDOWS_VT_MODE
5334 try:
5335 Popen.run('', shell=True)
5336 except Exception:
5337 return
5338
5339 WINDOWS_VT_MODE = True
5340 supports_terminal_sequences.cache_clear()
5341
5342
5343 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5344
5345
5346 def remove_terminal_sequences(string):
5347 return _terminal_sequences_re.sub('', string)
5348
5349
5350 def number_of_digits(number):
5351 return len('%d' % number)
5352
5353
5354 def join_nonempty(*values, delim='-', from_dict=None):
5355 if from_dict is not None:
5356 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5357 return delim.join(map(str, filter(None, values)))
5358
5359
5360 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5361 """
5362 Find the largest format dimensions in terms of video width and, for each thumbnail:
5363 * Modify the URL: Match the width with the provided regex and replace with the former width
5364 * Update dimensions
5365
5366 This function is useful with video services that scale the provided thumbnails on demand
5367 """
5368 _keys = ('width', 'height')
5369 max_dimensions = max(
5370 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5371 default=(0, 0))
5372 if not max_dimensions[0]:
5373 return thumbnails
5374 return [
5375 merge_dicts(
5376 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5377 dict(zip(_keys, max_dimensions)), thumbnail)
5378 for thumbnail in thumbnails
5379 ]
5380
5381
5382 def parse_http_range(range):
5383 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5384 if not range:
5385 return None, None, None
5386 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5387 if not crg:
5388 return None, None, None
5389 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
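
# Header parsing sketch (expected tuples assumed):
#   >>> parse_http_range('bytes=0-499')         # "Range" request header
#   (0, 499, None)
#   >>> parse_http_range('bytes 0-499/1234')    # "Content-Range" response header
#   (0, 499, 1234)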
5390
5391
5392 def read_stdin(what):
5393 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5394 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5395 return sys.stdin
5396
5397
5398 def determine_file_encoding(data):
5399 """
5400 Detect the text encoding used
5401 @returns (encoding, bytes to skip)
5402 """
5403
5404 # BOM marks are given priority over declarations
5405 for bom, enc in BOMS:
5406 if data.startswith(bom):
5407 return enc, len(bom)
5408
5409 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5410 # We ignore the endianness to get a good enough match
5411 data = data.replace(b'\0', b'')
5412 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5413 return mobj.group(1).decode() if mobj else None, 0
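
# Detection sketch (expected values assumed):
#   >>> determine_file_encoding(b'\xef\xbb\xbf# yt-dlp config')
#   ('utf-8', 3)   # a BOM wins; 3 bytes to skip
#   >>> determine_file_encoding(b'# coding: shift-jis\n--no-part')
#   ('shift-jis', 0)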


class Config:
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
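
# A hedged sketch of the credential scrubbing (argument values are illustrative):
# >>> Config.hide_login_info(['-u', 'name', '--video-password=secret', '-f', 'best'])
# ['-u', 'PRIVATE', '--video-password=PRIVATE', '-f', 'best']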


class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancel pending tasks before closing the loop; the reverse order
            # would attempt to run the cancellations on an already-closed loop
            self._cancel_all_tasks(self.loop)
            self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any new library that uses asyncio needs to be run in
    # non-async code, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
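
# A hedged usage sketch (the URL and payload are illustrative; requires the
# optional websockets dependency and a reachable server):
# ws = WebSocketsWrapper('wss://example.com/socket', headers={'Origin': 'https://example.com'})
# ws.send('{"op": "subscribe"}')
# message = ws.recv()
# ws.__exit__(None, None, None)  # closes the connection and the event loop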


def merge_headers(*dicts):
    """Merge dicts of HTTP headers case-insensitively, prioritizing the later ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
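
# A hedged doctest-style sketch (keys are normalized via str.title(), so the
# later dict's value wins regardless of the original casing):
# >>> merge_headers({'user-agent': 'UA-1'}, {'User-Agent': 'UA-2', 'accept': '*/*'})
# {'User-Agent': 'UA-2', 'Accept': '*/*'}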


def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        key = tuple(bound_args.arguments.values())

        if not hasattr(self, '__cached_method__cache'):
            self.__cached_method__cache = {}
        cache = self.__cached_method__cache.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
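
# A hedged sketch (Example and slow_square are hypothetical names; the cache is
# stored per instance and keyed by the bound arguments):
# class Example:
#     @cached_method
#     def slow_square(self, n):
#         print('computing')  # runs only on the first call per argument
#         return n * n
# ex = Example()
# ex.slow_square(3); ex.slow_square(3)  # 'computing' is printed once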


class classproperty:
    """property access for class methods"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)
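
# A hedged sketch (Example is a hypothetical name):
# class Example:
#     @classproperty
#     def name(cls):
#         return cls.__name__
# Example.name evaluates to 'Example' without instantiation or an explicit call.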


class Namespace(types.SimpleNamespace):
    """Namespace intended to be treated as immutable; iterating it yields its values"""

    def __iter__(self):
        return iter(self.__dict__.values())

    @property
    def items_(self):
        return self.__dict__.items()
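
# A hedged doctest-style sketch:
# >>> ns = Namespace(a=1, b=2)
# >>> list(ns)
# [1, 2]
# >>> dict(ns.items_)
# {'a': 1, 'b': 2}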


MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)


class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # str_or_none guards against non-string causes (e.g. exception objects)
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
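
# A hedged wiring sketch (fetch() is hypothetical; report_retry's keyword
# arguments are bound once via functools.partial in __init__, and the final
# failure re-raises the last error since no error= callback is given):
# for retry in RetryManager(3, RetryManager.report_retry,
#                           sleep_func=1, info=print, warn=print):
#     try:
#         fetch()
#     except OSError as err:
#         retry.error = err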


def make_archive_id(ie, video_id):
    ie_key = ie if isinstance(ie, str) else ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
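
# A hedged doctest-style sketch (the video ID is illustrative):
# >>> make_archive_id('Youtube', 'dQw4w9WgXcQ')
# 'youtube dQw4w9WgXcQ'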


# Deprecated
has_certifi = bool(certifi)
has_websockets = bool(websockets)