yt_dlp/utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import importlib.util
  22 import inspect
  23 import io
  24 import itertools
  25 import json
  26 import locale
  27 import math
  28 import mimetypes
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import shlex
  35 import socket
  36 import ssl
  37 import struct
  38 import subprocess
  39 import sys
  40 import tempfile
  41 import time
  42 import traceback
  43 import types
  44 import urllib.error
  45 import urllib.parse
  46 import urllib.request
  47 import xml.etree.ElementTree
  48 import zlib
  49
  50 from .compat import functools  # isort: split
  51 from .compat import (
  52     compat_etree_fromstring,
  53     compat_expanduser,
  54     compat_HTMLParseError,
  55     compat_os_name,
  56     compat_shlex_quote,
  57 )
  58 from .dependencies import brotli, certifi, websockets, xattr
  59 from .socks import ProxyType, sockssocket
  60
  61
  62 def register_socks_protocols():
  63     # "Register" SOCKS protocols
  64     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  65     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  66     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  67         if scheme not in urllib.parse.uses_netloc:
  68             urllib.parse.uses_netloc.append(scheme)
  69
  70
# The type of a compiled regular expression is not clearly defined otherwise,
# so it is taken from an actual compiled pattern object
compiled_regex_type = type(re.compile(''))
  73
  74
  75 def random_user_agent():
  76     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  77     _CHROME_VERSIONS = (
  78         '90.0.4430.212',
  79         '90.0.4430.24',
  80         '90.0.4430.70',
  81         '90.0.4430.72',
  82         '90.0.4430.85',
  83         '90.0.4430.93',
  84         '91.0.4472.101',
  85         '91.0.4472.106',
  86         '91.0.4472.114',
  87         '91.0.4472.124',
  88         '91.0.4472.164',
  89         '91.0.4472.19',
  90         '91.0.4472.77',
  91         '92.0.4515.107',
  92         '92.0.4515.115',
  93         '92.0.4515.131',
  94         '92.0.4515.159',
  95         '92.0.4515.43',
  96         '93.0.4556.0',
  97         '93.0.4577.15',
  98         '93.0.4577.63',
  99         '93.0.4577.82',
 100         '94.0.4606.41',
 101         '94.0.4606.54',
 102         '94.0.4606.61',
 103         '94.0.4606.71',
 104         '94.0.4606.81',
 105         '94.0.4606.85',
 106         '95.0.4638.17',
 107         '95.0.4638.50',
 108         '95.0.4638.54',
 109         '95.0.4638.69',
 110         '95.0.4638.74',
 111         '96.0.4664.18',
 112         '96.0.4664.45',
 113         '96.0.4664.55',
 114         '96.0.4664.93',
 115         '97.0.4692.20',
 116     )
 117     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 118
 119
 120 SUPPORTED_ENCODINGS = [
 121     'gzip', 'deflate'
 122 ]
 123 if brotli:
 124     SUPPORTED_ENCODINGS.append('br')
 125
# Standard set of HTTP headers.
# NOTE: random_user_agent() is evaluated here, i.e. the User-Agent is chosen once at import time
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 132
 133
# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 137
 138
# Unique sentinel meaning "no default was supplied"; compared by identity,
# which keeps None usable as a real default value
NO_DEFAULT = object()
# Function that returns its argument unchanged
IDENTITY = lambda x: x
 141
# Full English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 152
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are paired
# positionally with the characters of the first string; multi-character replacements
# are wrapped in one-item lists so that itertools.chain yields them as a whole
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 157
 158 DATE_FORMATS = (
 159     '%d %B %Y',
 160     '%d %b %Y',
 161     '%B %d %Y',
 162     '%B %dst %Y',
 163     '%B %dnd %Y',
 164     '%B %drd %Y',
 165     '%B %dth %Y',
 166     '%b %d %Y',
 167     '%b %dst %Y',
 168     '%b %dnd %Y',
 169     '%b %drd %Y',
 170     '%b %dth %Y',
 171     '%b %dst %Y %I:%M',
 172     '%b %dnd %Y %I:%M',
 173     '%b %drd %Y %I:%M',
 174     '%b %dth %Y %I:%M',
 175     '%Y %m %d',
 176     '%Y-%m-%d',
 177     '%Y.%m.%d.',
 178     '%Y/%m/%d',
 179     '%Y/%m/%d %H:%M',
 180     '%Y/%m/%d %H:%M:%S',
 181     '%Y%m%d%H%M',
 182     '%Y%m%d%H%M%S',
 183     '%Y%m%d',
 184     '%Y-%m-%d %H:%M',
 185     '%Y-%m-%d %H:%M:%S',
 186     '%Y-%m-%d %H:%M:%S.%f',
 187     '%Y-%m-%d %H:%M:%S:%f',
 188     '%d.%m.%Y %H:%M',
 189     '%d.%m.%Y %H.%M',
 190     '%Y-%m-%dT%H:%M:%SZ',
 191     '%Y-%m-%dT%H:%M:%S.%fZ',
 192     '%Y-%m-%dT%H:%M:%S.%f0Z',
 193     '%Y-%m-%dT%H:%M:%S',
 194     '%Y-%m-%dT%H:%M:%S.%f',
 195     '%Y-%m-%dT%H:%M',
 196     '%b %d %Y at %H:%M',
 197     '%b %d %Y at %H:%M:%S',
 198     '%B %d %Y at %H:%M',
 199     '%B %d %Y at %H:%M:%S',
 200     '%H:%M %d-%b-%Y',
 201 )
 202
 203 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 204 DATE_FORMATS_DAY_FIRST.extend([
 205     '%d-%m-%Y',
 206     '%d.%m.%Y',
 207     '%d.%m.%y',
 208     '%d/%m/%Y',
 209     '%d/%m/%y',
 210     '%d/%m/%Y %H:%M:%S',
 211     '%d-%m-%Y %H:%M',
 212 ])
 213
 214 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 215 DATE_FORMATS_MONTH_FIRST.extend([
 216     '%m-%d-%Y',
 217     '%m.%d.%Y',
 218     '%m/%d/%Y',
 219     '%m/%d/%y',
 220     '%m/%d/%Y %H:%M:%S',
 221 ])
 222
# Matches `}('<payload>',<int>,<int>,'<a|b|c>'.split('|')` and captures the four arguments
# (presumably the call emitted by a JavaScript packer - verify against the callers)
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (case-insensitive, dot matches
# newlines) and captures the enclosed JSON object in the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

# Matches an unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
 227
 228
 229 @functools.cache
 230 def preferredencoding():
 231     """Get preferred encoding.
 232
 233     Returns the best encoding scheme for the system, based on
 234     locale.getpreferredencoding() and some further tweaks.
 235     """
 236     try:
 237         pref = locale.getpreferredencoding()
 238         'TEST'.encode(pref)
 239     except Exception:
 240         pref = 'UTF-8'
 241
 242     return pref
 243
 244
 245 def write_json_file(obj, fn):
 246     """ Encode obj as JSON and write it to fn, atomically if possible """
 247
 248     tf = tempfile.NamedTemporaryFile(
 249         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 250         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 251
 252     try:
 253         with tf:
 254             json.dump(obj, tf, ensure_ascii=False)
 255         if sys.platform == 'win32':
 256             # Need to remove existing file on Windows, else os.rename raises
 257             # WindowsError or FileExistsError.
 258             with contextlib.suppress(OSError):
 259                 os.unlink(fn)
 260         with contextlib.suppress(OSError):
 261             mask = os.umask(0)
 262             os.umask(mask)
 263             os.chmod(tf.name, 0o666 & ~mask)
 264         os.rename(tf.name, fn)
 265     except Exception:
 266         with contextlib.suppress(OSError):
 267             os.remove(tf.name)
 268         raise
 269
 270
 271 def find_xpath_attr(node, xpath, key, val=None):
 272     """ Find the xpath xpath[@key=val] """
 273     assert re.match(r'^[a-zA-Z_-]+$', key)
 274     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 275     return node.find(expr)
 276
 277 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 278 # the namespace parameter
 279
 280
 281 def xpath_with_ns(path, ns_map):
 282     components = [c.split(':') for c in path.split('/')]
 283     replaced = []
 284     for c in components:
 285         if len(c) == 1:
 286             replaced.append(c[0])
 287         else:
 288             ns, tag = c
 289             replaced.append('{%s}%s' % (ns_map[ns], tag))
 290     return '/'.join(replaced)
 291
 292
 293 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 294     def _find_xpath(xpath):
 295         return node.find(xpath)
 296
 297     if isinstance(xpath, str):
 298         n = _find_xpath(xpath)
 299     else:
 300         for xp in xpath:
 301             n = _find_xpath(xp)
 302             if n is not None:
 303                 break
 304
 305     if n is None:
 306         if default is not NO_DEFAULT:
 307             return default
 308         elif fatal:
 309             name = xpath if name is None else name
 310             raise ExtractorError('Could not find XML element %s' % name)
 311         else:
 312             return None
 313     return n
 314
 315
 316 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 317     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 318     if n is None or n == default:
 319         return n
 320     if n.text is None:
 321         if default is not NO_DEFAULT:
 322             return default
 323         elif fatal:
 324             name = xpath if name is None else name
 325             raise ExtractorError('Could not find XML element\'s text %s' % name)
 326         else:
 327             return None
 328     return n.text
 329
 330
 331 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 332     n = find_xpath_attr(node, xpath, key)
 333     if n is None:
 334         if default is not NO_DEFAULT:
 335             return default
 336         elif fatal:
 337             name = f'{xpath}[@{key}]' if name is None else name
 338             raise ExtractorError('Could not find XML attribute %s' % name)
 339         else:
 340             return None
 341     return n.attrib[key]
 342
 343
 344 def get_element_by_id(id, html, **kwargs):
 345     """Return the content of the tag with the specified ID in the passed HTML document"""
 346     return get_element_by_attribute('id', id, html, **kwargs)
 347
 348
 349 def get_element_html_by_id(id, html, **kwargs):
 350     """Return the html of the tag with the specified ID in the passed HTML document"""
 351     return get_element_html_by_attribute('id', id, html, **kwargs)
 352
 353
 354 def get_element_by_class(class_name, html):
 355     """Return the content of the first tag with the specified class in the passed HTML document"""
 356     retval = get_elements_by_class(class_name, html)
 357     return retval[0] if retval else None
 358
 359
 360 def get_element_html_by_class(class_name, html):
 361     """Return the html of the first tag with the specified class in the passed HTML document"""
 362     retval = get_elements_html_by_class(class_name, html)
 363     return retval[0] if retval else None
 364
 365
 366 def get_element_by_attribute(attribute, value, html, **kwargs):
 367     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 368     return retval[0] if retval else None
 369
 370
 371 def get_element_html_by_attribute(attribute, value, html, **kargs):
 372     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 373     return retval[0] if retval else None
 374
 375
 376 def get_elements_by_class(class_name, html, **kargs):
 377     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 378     return get_elements_by_attribute(
 379         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 380         html, escape_value=False)
 381
 382
 383 def get_elements_html_by_class(class_name, html):
 384     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 385     return get_elements_html_by_attribute(
 386         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 387         html, escape_value=False)
 388
 389
 390 def get_elements_by_attribute(*args, **kwargs):
 391     """Return the content of the tag with the specified attribute in the passed HTML document"""
 392     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 393
 394
 395 def get_elements_html_by_attribute(*args, **kwargs):
 396     """Return the html of the tag with the specified attribute in the passed HTML document"""
 397     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 398
 399
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator that yields one (content, whole) tuple for every matching tag.

    @param attribute     Name of the attribute to look for
    @param value         Attribute value to match
    @param html          HTML document to search in
    @param escape_value  If True, value is matched literally; if False, it is
                         inserted into the pattern as a regular expression
    """

    # The quote around the value is optional ('?') unless value starts with one of
    # these characters (re.match only looks at the beginning of value)
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from the start of an opening tag up to and including the wanted
    # attribute. (?-x:...) switches verbose mode off for the value itself
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the whole element, starting at the position of the match
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes surrounding the content, then unescape it
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 423
 424
 425 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 426     """
 427     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 428     closing tag for the first opening tag it has encountered, and can be used
 429     as a context manager
 430     """
 431
 432     class HTMLBreakOnClosingTagException(Exception):
 433         pass
 434
 435     def __init__(self):
 436         self.tagstack = collections.deque()
 437         html.parser.HTMLParser.__init__(self)
 438
 439     def __enter__(self):
 440         return self
 441
 442     def __exit__(self, *_):
 443         self.close()
 444
 445     def close(self):
 446         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 447         # so data remains buffered; we no longer have any interest in it, thus
 448         # override this method to discard it
 449         pass
 450
 451     def handle_starttag(self, tag, _):
 452         self.tagstack.append(tag)
 453
 454     def handle_endtag(self, tag):
 455         if not self.tagstack:
 456             raise compat_HTMLParseError('no tags in the stack')
 457         while self.tagstack:
 458             inner_tag = self.tagstack.pop()
 459             if inner_tag == tag:
 460                 break
 461         else:
 462             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 463         if not self.tagstack:
 464             raise self.HTMLBreakOnClosingTagException()
 465
 466
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @returns    Tuple (content, whole)
    @raises     compat_HTMLParseError if the opening or the matching closing
                tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given exception if needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Make the index absolute and move it past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        # `<tag` is only a prefix match; make sure the parser saw exactly this tag
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document up to each following closing tag in turn, until
        # the parser signals that the first opening tag has been closed
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # offset was not advanced for this chunk, so the closing tag
                # positions are still relative to it
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 500
 501
 502 class HTMLAttributeParser(html.parser.HTMLParser):
 503     """Trivial HTML parser to gather the attributes for a single element"""
 504
 505     def __init__(self):
 506         self.attrs = {}
 507         html.parser.HTMLParser.__init__(self)
 508
 509     def handle_starttag(self, tag, attrs):
 510         self.attrs = dict(attrs)
 511
 512
 513 class HTMLListAttrsParser(html.parser.HTMLParser):
 514     """HTML parser to gather the attributes for the elements of a list"""
 515
 516     def __init__(self):
 517         html.parser.HTMLParser.__init__(self)
 518         self.items = []
 519         self._level = 0
 520
 521     def handle_starttag(self, tag, attrs):
 522         if tag == 'li' and self._level == 0:
 523             self.items.append(dict(attrs))
 524         self._level += 1
 525
 526     def handle_endtag(self, tag):
 527         self._level -= 1
 528
 529
 530 def extract_attributes(html_element):
 531     """Given a string for an HTML element such as
 532     <el
 533          a="foo" B="bar" c="&98;az" d=boz
 534          empty= noval entity="&amp;"
 535          sq='"' dq="'"
 536     >
 537     Decode and return a dictionary of attributes.
 538     {
 539         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 540         'empty': '', 'noval': None, 'entity': '&',
 541         'sq': '"', 'dq': '\''
 542     }.
 543     """
 544     parser = HTMLAttributeParser()
 545     with contextlib.suppress(compat_HTMLParseError):
 546         parser.feed(html_element)
 547         parser.close()
 548     return parser.attrs
 549
 550
 551 def parse_list(webpage):
 552     """Given a string for an series of HTML <li> elements,
 553     return a dictionary of their attributes"""
 554     parser = HTMLListAttrsParser()
 555     parser.feed(webpage)
 556     parser.close()
 557     return parser.items
 558
 559
 560 def clean_html(html):
 561     """Clean an HTML snippet into a readable string"""
 562
 563     if html is None:  # Convenience for sanitizing descriptions etc.
 564         return html
 565
 566     html = re.sub(r'\s+', ' ', html)
 567     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 568     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 569     # Strip html tags
 570     html = re.sub('<.*?>', '', html)
 571     # Replace html entities
 572     html = unescapeHTML(html)
 573     return html.strip()
 574
 575
 576 class LenientJSONDecoder(json.JSONDecoder):
 577     def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
 578         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 579         super().__init__(*args, **kwargs)
 580
 581     def decode(self, s):
 582         if self.transform_source:
 583             s = self.transform_source(s)
 584         if self.ignore_extra:
 585             return self.raw_decode(s.lstrip())[0]
 586         return super().decode(s)
 587
 588
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream. Eg, when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the filename as given, the second one the sanitized filename
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably LockingUnsupportedError is an OSError subclass,
                    # so that the plain open() below is used instead - confirm
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed; open the file without a lock
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, or if permission was denied
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Retrying is pointless if sanitizing did not change the filename
            if old_filename == filename:
                raise
 626
 627
 628 def timeconvert(timestr):
 629     """Convert RFC 2822 defined time string into system timestamp"""
 630     timestamp = None
 631     timetuple = email.utils.parsedate_tz(timestr)
 632     if timetuple is not None:
 633         timestamp = email.utils.mktime_tz(timetuple)
 634     return timestamp
 635
 636
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' only if s itself is ''
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Substitute characters are prefixed with '\0', which marks them as
        # substitutes for the clean-up steps below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers themselves; the substitute characters stay
    result = result.replace('\0', '') or '_'

    # NOTE: The NO_DEFAULT sentinel is truthy, so this legacy clean-up only
    # runs when is_id was explicitly passed as a falsy value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 684
 685
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param s        The path to sanitize
    @param force    Sanitize the path on other platforms too.
                    Without it, the path is returned unchanged there
    """
    if sys.platform == 'win32':
        # force only has an effect on other platforms
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Discard the first component of what follows the drive/UNC part
        # (the empty string before the leading separator)
        norm_path.pop(0)
    # Replace the forbidden characters, as well as a trailing whitespace or dot,
    # with '#' in every path component except '.' and '..'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
 707
 708
 709 def sanitize_url(url, *, scheme='http'):
 710     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 711     # the number of unwanted failures due to missing protocol
 712     if url is None:
 713         return
 714     elif url.startswith('//'):
 715         return f'{scheme}:{url}'
 716     # Fix some common typos seen so far
 717     COMMON_TYPOS = (
 718         # https://github.com/ytdl-org/youtube-dl/issues/15649
 719         (r'^httpss://', r'https://'),
 720         # https://bx1.be/lives/direct-tv/
 721         (r'^rmtp([es]?)://', r'rtmp\1://'),
 722     )
 723     for mistake, fixup in COMMON_TYPOS:
 724         if re.match(mistake, url):
 725             return re.sub(mistake, fixup, url)
 726     return url
 727
 728
 729 def extract_basic_auth(url):
 730     parts = urllib.parse.urlsplit(url)
 731     if parts.username is None:
 732         return url, None
 733     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 734         parts.hostname if parts.port is None
 735         else '%s:%d' % (parts.hostname, parts.port))))
 736     auth_payload = base64.b64encode(
 737         ('%s:%s' % (parts.username, parts.password or '')).encode())
 738     return url, f'Basic {auth_payload.decode()}'
 739
 740
 741 def sanitized_Request(url, *args, **kwargs):
 742     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 743     if auth_header is not None:
 744         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 745         headers['Authorization'] = auth_header
 746     return urllib.request.Request(url, *args, **kwargs)
 747
 748
 749 def expand_path(s):
 750     """Expand shell variables and ~"""
 751     return os.path.expandvars(compat_expanduser(s))
 752
 753
 754 def orderedSet(iterable, *, lazy=False):
 755     """Remove all duplicates from the input iterable"""
 756     def _iter():
 757         seen = []  # Do not use set since the items can be unhashable
 758         for x in iterable:
 759             if x not in seen:
 760                 seen.append(x)
 761                 yield x
 762
 763     return _iter() if lazy else list(_iter())
 764
 765
 766 def _htmlentity_transform(entity_with_semicolon):
 767     """Transforms an HTML entity to a character."""
 768     entity = entity_with_semicolon[:-1]
 769
 770     # Known non-numeric HTML entity
 771     if entity in html.entities.name2codepoint:
 772         return chr(html.entities.name2codepoint[entity])
 773
 774     # TODO: HTML5 allows entities without a semicolon. For example,
 775     # '&Eacuteric' should be decoded as 'Éric'.
 776     if entity_with_semicolon in html.entities.html5:
 777         return html.entities.html5[entity_with_semicolon]
 778
 779     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 780     if mobj is not None:
 781         numstr = mobj.group(1)
 782         if numstr.startswith('x'):
 783             base = 16
 784             numstr = '0%s' % numstr
 785         else:
 786             base = 10
 787         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 788         with contextlib.suppress(ValueError):
 789             return chr(int(numstr, base))
 790
 791     # Unknown entity in name, return its literal representation
 792     return '&%s;' % entity
 793
 794
 795 def unescapeHTML(s):
 796     if s is None:
 797         return None
 798     assert isinstance(s, str)
 799
 800     return re.sub(
 801         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 802
 803
 804 def escapeHTML(text):
 805     return (
 806         text
 807         .replace('&', '&amp;')
 808         .replace('<', '&lt;')
 809         .replace('>', '&gt;')
 810         .replace('"', '&quot;')
 811         .replace("'", '&#39;')
 812     )
 813
 814
 815 def process_communicate_or_kill(p, *args, **kwargs):
 816     write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
 817                  'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
 818     return Popen.communicate_or_kill(p, *args, **kwargs)
 819
 820
 821 class Popen(subprocess.Popen):
 822     if sys.platform == 'win32':
 823         _startupinfo = subprocess.STARTUPINFO()
 824         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 825     else:
 826         _startupinfo = None
 827
 828     def __init__(self, *args, text=False, **kwargs):
 829         if text is True:
 830             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 831             kwargs.setdefault('encoding', 'utf-8')
 832             kwargs.setdefault('errors', 'replace')
 833         super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
 834
 835     def communicate_or_kill(self, *args, **kwargs):
 836         try:
 837             return self.communicate(*args, **kwargs)
 838         except BaseException:  # Including KeyboardInterrupt
 839             self.kill(timeout=None)
 840             raise
 841
 842     def kill(self, *, timeout=0):
 843         super().kill()
 844         if timeout != 0:
 845             self.wait(timeout=timeout)
 846
 847     @classmethod
 848     def run(cls, *args, **kwargs):
 849         with cls(*args, **kwargs) as proc:
 850             stdout, stderr = proc.communicate_or_kill()
 851             return stdout or '', stderr or '', proc.returncode
 852
 853
 854 def get_subprocess_encoding():
 855     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 856         # For subprocess calls, encode with locale encoding
 857         # Refer to http://stackoverflow.com/a/9951851/35070
 858         encoding = preferredencoding()
 859     else:
 860         encoding = sys.getfilesystemencoding()
 861     if encoding is None:
 862         encoding = 'utf-8'
 863     return encoding
 864
 865
 866 def encodeFilename(s, for_subprocess=False):
 867     assert isinstance(s, str)
 868     return s
 869
 870
 871 def decodeFilename(b, for_subprocess=False):
 872     return b
 873
 874
 875 def encodeArgument(s):
 876     # Legacy code that uses byte strings
 877     # Uncomment the following line after fixing all post processors
 878     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 879     return s if isinstance(s, str) else s.decode('ascii')
 880
 881
def decodeArgument(b):
    """Return `b` unchanged."""
    return b
 884
 885
 886 def decodeOption(optval):
 887     if optval is None:
 888         return optval
 889     if isinstance(optval, bytes):
 890         optval = optval.decode(preferredencoding())
 891
 892     assert isinstance(optval, str)
 893     return optval
 894
 895
 896 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 897
 898
 899 def timetuple_from_msec(msec):
 900     secs, msec = divmod(msec, 1000)
 901     mins, secs = divmod(secs, 60)
 902     hrs, mins = divmod(mins, 60)
 903     return _timetuple(hrs, mins, secs, msec)
 904
 905
 906 def formatSeconds(secs, delim=':', msec=False):
 907     time = timetuple_from_msec(secs * 1000)
 908     if time.hours:
 909         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 910     elif time.minutes:
 911         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 912     else:
 913         ret = '%d' % time.seconds
 914     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 915
 916
 917 def _ssl_load_windows_store_certs(ssl_context, storename):
 918     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 919     try:
 920         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 921                  if encoding == 'x509_asn' and (
 922                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 923     except PermissionError:
 924         return
 925     for cert in certs:
 926         with contextlib.suppress(ssl.SSLError):
 927             ssl_context.load_verify_locations(cadata=cert)
 928
 929
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSL context configured from `params`.

    Keys read from `params`: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'. Extra keyword arguments are forwarded to
    YoutubeDLHTTPSHandler.

    Raises YoutubeDLError if the client certificate cannot be loaded
    (ssl.SSLError from load_cert_chain).
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # NOTE: check_hostname is assigned before verify_mode on purpose; the ssl module
    # rejects verify_mode = CERT_NONE while check_hostname is still enabled
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle unless it is unavailable or disabled via compat_opts
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    # Optional client-side certificate (key file and password may be None)
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 972
 973
 974 def bug_reports_message(before=';'):
 975     from .update import REPOSITORY
 976
 977     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 978            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 979
 980     before = before.rstrip()
 981     if not before or before.endswith(('.', '!', '?')):
 982         msg = msg[0].title() + msg[1:]
 983
 984     return (before + ' ' if before else '') + msg
 985
 986
 987 class YoutubeDLError(Exception):
 988     """Base exception for YoutubeDL errors."""
 989     msg = None
 990
 991     def __init__(self, msg=None):
 992         if msg is not None:
 993             self.msg = msg
 994         elif self.msg is None:
 995             self.msg = type(self).__name__
 996         super().__init__(self.msg)
 997
 998
 999 network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
1000 if hasattr(ssl, 'CertificateError'):
1001     network_exceptions.append(ssl.CertificateError)
1002 network_exceptions = tuple(network_exceptions)
1003
1004
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """Build the full error message and record the context of the error.

        msg is the description of the problem; it is converted with str(), so
        non-string values (e.g. another exception) are accepted.
        tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie are stored on the instance and, when set, are
        included in the message.
        """
        # An error raised while handling a network exception is never a bug in yt-dlp.
        # NOTE: this is a membership test, so only the exact classes listed match
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Re-wrapping an ExtractorError: keep the exc_info of the innermost error
            self.exc_info = self.exc_info[1].exc_info

        # Join the stringified message: the raw `msg` is not guaranteed to be a str
        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            self.orig_msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback and cause joined by a newline, or None if neither is set."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            # [1:] drops the first line produced by format_exception
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
1037
1038
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1044
1045
class RegexNotFoundError(ExtractorError):
    """Error raised when a regular expression did not match."""
    pass
1049
1050
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    It is always marked as expected; `countries` is stored on the instance as given.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # `expected` is forced to True regardless of what the caller passed
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1062
1063
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live (always marked as expected)"""

    def __init__(self, msg=None, **kwargs):
        kwargs.update(expected=True)
        if not msg:
            msg = 'The channel is not currently live'
        super().__init__(msg, **kwargs)
1070
1071
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """Store the message and the optional original exception info.

        exc_info, if given, is the original exception that caused the trouble
        (as returned by sys.exc_info()).
        """
        self.exc_info = exc_info
        super().__init__(msg)
1084
1085
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    # Default message, used when the exception is created without one
    msg = 'Entry not found in info'
1093
1094
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ': <filename>' to the default message when a filename is given."""
        if filename is not None:
            # Assigning through self creates an instance attribute; the class default is untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1107
1108
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by a PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
1115
1116
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message, used when the exception is created without one
    msg = 'The download was cancelled'
1120
1121
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered: a video already in the archive was encountered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1125
1126
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered: a video that did not match the filter was encountered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1130
1131
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message, used when the exception is created without one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1135
1136
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """Store the message and the `expected` flag on the instance."""
        self.expected = expected
        super().__init__(msg)
1143
1144
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the class message and is never marked as expected
        super().__init__(self.msg, expected=False)
1151
1152
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ': <err>' to the default message when `err` is given."""
        if err is not None:
            # Assigning through self creates an instance attribute; the class default is untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1165
1166
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Record the downloaded and the expected size, both in bytes."""
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1180
1181
class XAttrMetadataError(YoutubeDLError):
    """Error with an errno-style `code`, a message and a derived `reason`.

    `reason` is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED', determined
    from the error code or from well-known phrases in the message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg or 'Disk quota exceeded' in msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1196
1197
class XAttrUnavailableError(YoutubeDLError):
    """Error related to extended file attributes (xattr); adds nothing to YoutubeDLError."""
    pass
1200
1201
1202 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1203     hc = http_class(*args, **kwargs)
1204     source_address = ydl_handler._params.get('source_address')
1205
1206     if source_address is not None:
1207         # This is to workaround _create_connection() from socket where it will try all
1208         # address data from getaddrinfo() including IPv6. This filters the result from
1209         # getaddrinfo() based on the source_address value.
1210         # This is based on the cpython socket.create_connection() function.
1211         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1212         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1213             host, port = address
1214             err = None
1215             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1216             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1217             ip_addrs = [addr for addr in addrs if addr[0] == af]
1218             if addrs and not ip_addrs:
1219                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1220                 raise OSError(
1221                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1222                     % (ip_version, source_address[0]))
1223             for res in ip_addrs:
1224                 af, socktype, proto, canonname, sa = res
1225                 sock = None
1226                 try:
1227                     sock = socket.socket(af, socktype, proto)
1228                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1229                         sock.settimeout(timeout)
1230                     sock.bind(source_address)
1231                     sock.connect(sa)
1232                     err = None  # Explicitly break reference cycle
1233                     return sock
1234                 except OSError as _:
1235                     err = _
1236                     if sock is not None:
1237                         sock.close()
1238             if err is not None:
1239                 raise err
1240             else:
1241                 raise OSError('getaddrinfo returns an empty list')
1242         if hasattr(hc, '_create_connection'):
1243             hc._create_connection = _create_connection
1244         hc.source_address = (source_address, 0)
1245
1246     return hc
1247
1248
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, `headers` itself is returned. With it, a new dict is
    returned that contains neither the marker nor any Accept-Encoding header
    (matched case-insensitively); the input is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name != marker and name.lower() != 'accept-encoding'}
1257
1258
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped,
    deflated and brotli-compressed responses from web servers. If
    compression is to be avoided in a particular request, the original
    request in the program code only has to include the HTTP header
    "Youtubedl-no-compression", which will be removed before making the
    real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep `params` in `self._params`; other arguments go to HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set."""
        conn_class = http.client.HTTPConnection

        # The internal proxy header is consumed here and never sent to the server
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data; falsy input is returned unchanged."""
        if not data:
            return data
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to zlib's default (zlib-wrapped) format
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; falsy input is returned unchanged."""
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        """Escape the request URL and add the default headers before sending."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Add the configured (or standard) headers without overriding those already set
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        # Honour and strip the internal "Youtubedl-no-compression" marker
        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Return `resp` with its body decompressed and its redirect URL escaped.

        For the Content-encoding values 'gzip', 'deflate' and 'br' the body is
        read and decompressed, and a new response is built from the headers,
        URL, code and msg of the original one, minus the Content-encoding header.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes removed, one more byte each time
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
1387
1388
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of `base_class` that connects through a SOCKS proxy.

    `base_class` must be http.client.HTTPConnection, http.client.HTTPSConnection
    or a subclass of one of them. `socks_proxy` is the proxy URL; its scheme
    (case-insensitive) selects the protocol: 'socks5', 'socks4a', or
    'socks'/'socks4' for SOCKS4. The port defaults to 1080 and a username and
    password in the URL are percent-decoded before use.

    Raises ValueError if the scheme of `socks_proxy` is not one of the above.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear error instead of an UnboundLocalError further down.
        # Only the scheme is reported since the URL may contain credentials
        raise ValueError(f'Unsupported SOCKS proxy scheme: {scheme!r}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            """Connect to the target host through the SOCKS proxy."""
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                # HTTPSConnection always provides `_context` on Python 3; the former
                # ssl.wrap_socket() fallback was unreachable (and is gone in Python 3.12)
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)

    return SocksConnection
1430
1431
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler with source-address binding and SOCKS proxy support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Keep `params` and the connection class; other arguments go to HTTPSHandler.

        `https_conn_class` defaults to http.client.HTTPSConnection.
        """
        super().__init__(*args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection

    def https_open(self, req):
        """Open `req`, through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set.

        A URLError caused by an SSLV3_ALERT_HANDSHAKE_FAILURE is turned into a
        YoutubeDLError suggesting --legacy-server-connect.
        """
        conn_class = self._https_conn_class

        # Forward the SSL settings stored on the handler, when they are present
        kwargs = {}
        for attr, option in (('_context', 'context'), ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr):
                kwargs[option] = getattr(self, attr)

        # The internal proxy header is consumed here and never sent to the server
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **kwargs)
        except urllib.error.URLError as e:
            reason = e.reason
            handshake_failed = (
                isinstance(reason, ssl.SSLError)
                and getattr(reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if not handshake_failed:
                raise
            raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1460
1461
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    Cookie jar that reads and writes Netscape formatted cookie files.

    Both file paths and already opened file objects are accepted as `filename`.
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    # Prefix marking a cookie file entry as HttpOnly; stripped when loading
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a cookie file entry
    _ENTRY_LEN = 7
    # Written at the top of every saved cookie file
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    # The fields of a cookie file entry, in file order
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        """Create the jar; `filename` may be a path or a file object."""
        # The parent is initialized without a filename; it is set below instead
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        """Return 'TRUE' or 'FALSE' according to the truthiness of `cndn`."""
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        """Return whether `file` is a path (str, bytes or os.PathLike)."""
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Context manager yielding a file object for `file`.

        Paths are opened as UTF-8 text (and closed on exit). Anything else is
        yielded as-is, after being truncated to zero length when `write` is set.
        """
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write the cookies to `f`, one tab-separated entry per line.

        Cookies marked as discard and expired cookies are skipped unless
        `ignore_discard` / `ignore_expires` is set.
        """
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.

        `filename` defaults to the jar's own filename; ValueError is raised if
        neither is set. The remaining arguments are passed to _really_save().

        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        # NOTE: this modifies the cookies held in the jar, not only the written file
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        `filename` defaults to the jar's own filename; ValueError is raised if
        neither is set. Invalid entries are skipped with a warning, except
        that a line which looks like JSON raises http.cookiejar.LoadError.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Return the line without its HttpOnly prefix; raise LoadError for invalid entries
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Collect the cleaned-up lines in memory and parse them from there
        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    # The trailing space guards the [0] lookup against empty lines
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See  '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1594
1595
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that handles HTTPS with the same methods as HTTP."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response and return its result."""
        return super().http_response(request, response)

    # HTTPS requests and responses are processed exactly like HTTP ones
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1605
1606
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    Compared to the stock handler, it also follows the HTTP response status
    code 308 Permanent Redirect [2] used by some sites [3], and it builds the
    follow-up request with an explicit method.

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # All supported redirect codes share the 302 handling
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        Called by the http_error_30x methods when a redirection response is
        received. Returns the new Request that http_error_30x should issue.
        Raises HTTPError when the combination of status code and request
        method must not be redirected.
        """
        method = req.get_method()
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        redirectable = (
            (code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD"))
            or (code in (301, 302, 303) and method == "POST"))
        if not redirectable:
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # The body is not carried over, so neither are the headers describing it
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {}
        for name, value in req.headers.items():
            if name.lower() not in CONTENT_HEADERS:
                newheaders[name] = value

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            method = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method)
1667
1668
def extract_timezone(date_str):
    """
    Split a trailing UTC offset off a date string

    Returns a tuple (offset, remainder): offset is a datetime.timedelta
    (zero when no timezone is recognized or when it is "Z") and remainder is
    date_str with the recognized timezone part removed.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:
        # Only the "Z" alternative matches without a sign: plain UTC
        return datetime.timedelta(), remainder

    direction = 1 if sign == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, remainder
1693
1694
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Return a UNIX timestamp from the given date

    @param delimiter    Separator between the date and the time part
    @param timezone     UTC offset (datetime.timedelta) of date_str;
                        extracted from date_str itself when None
    Returns None if date_str is None or does not match the expected layout
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1710
1711
def date_formats(day_first=True):
    """Return the day-first or the month-first collection of date formats"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1714
1715
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None if it cannot be parsed"""
    if date_str is None:
        return None

    # Commas become spaces; AM/PM markers (and a following upper-case word) are dropped
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; if several of them match, the last one wins
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if upload_date is None else str(upload_date)
1738
1739
def unified_timestamp(date_str, day_first=True):
    """
    Return a UNIX timestamp parsed from a free-form date string

    @param day_first    Whether ambiguous dates are tried with the day-first
                        formats (see date_formats)
    Returns None if date_str is None or cannot be parsed
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # NOTE: any "PM" adds 12 hours, including for 12:xx PM
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The UTC offset was already stripped from date_str by extract_timezone(),
        # so it has to be applied here just like in the strptime path above
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
1769
1770
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL

    Returns default_ext when url is None, contains no dot, or the part after
    the last dot (query string excluded) does not look like an extension.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1782
1783
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" (via replace_extension)"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1786
1787
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # No relative part: date_str is a plain DATE
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset may itself be relative, so resolve it recursively
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is then rounded like a day offset
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1828
1829
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    Raises ValueError if strict is set and date_str is not in an allowed pattern
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1840
1841
def datetime_add_months(dt, months):
    """
    Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_shift
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1849
1850
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision    microsecond|second|minute|hour|day
                        "microsecond" returns dt unchanged
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = seconds_per_unit[precision]
    # Round half up to the nearest multiple of step
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1867
1868
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not in 'YYYYMMDD' format are returned unchanged
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return f'{year}-{month}-{day}'
1877
1878
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by
        date_from_str(..., strict=True); None leaves that side unbounded
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return (self.start, self.end) == (other.start, other.end)
1912
1913
def platform_name():
    """ Returns the platform name as a str (deprecated: writes a deprecation notice on every call) """
    deprecation_notice = 'DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead'
    write_string(deprecation_notice)
    return platform.platform()
1918
1919
@functools.cache
def system_identifier():
    """
    Return a one-line description of the running interpreter and platform
    (Python version, implementation, architecture, platform and libc version)
    """
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    libc_info = format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)')
    return 'Python %s (%s %s) - %s %s' % (
        platform.python_version(),
        implementation,
        platform.architecture()[0],
        platform.platform(),
        libc_info,
    )
1933
1934
@functools.cache
def get_windows_version():
    """ Get Windows version. returns () if it's not running on Windows """
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1942
1943
def write_string(s, out=None, encoding=None):
    """
    Write the string s to out (sys.stderr by default) and flush it

    @param encoding     Encoding used when out (or its underlying buffer)
                        takes bytes; defaults to the stream's own encoding
                        or the preferred encoding
    """
    assert isinstance(s, str)
    if not out:
        out = sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Put a space in front of every run of line breaks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, enc = out, None
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: bytes are written to it directly
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with a binary buffer: encode ourselves and bypass the text layer
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1960
1961
def bytes_to_intlist(bs):
    """Return the items of bs as a list of ints (empty list for empty input)"""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Indexing already yields ints (e.g. bytes)
        return list(bs)
    # Otherwise every item is a single character
    return list(map(ord, bs))
1969
1970
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) into a bytes object"""
    if xs:
        count = len(xs)
        return struct.pack('%dB' % count, *xs)
    return b''
1975
1976
class LockingUnsupportedError(OSError):
    """OSError carrying a fixed "File locking is not supported" message"""
    msg = 'File locking is not supported'

    def __init__(self):
        OSError.__init__(self, self.msg)
1982
1983
# Cross-platform file locking
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using
# LockFileEx/UnlockFileEx on Windows and fcntl elsewhere; when neither is
# available both functions raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Length of the locked byte range (low/high DWORD), starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object f; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file() needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file(); raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file object f with flock(), falling back to lockf()"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else; do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on the file object f"""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
2068
2069
class locked_file:
    """
    File wrapper that locks the file on entering the context (or open())
    and unlocks and closes it on leaving (or close())

    The lock is shared for read modes and exclusive otherwise. Attributes
    that are not defined here are forwarded to the underlying file object.

    @param mode         One of r, rb, a, ab, w, wb
    @param block        Whether to wait for the lock instead of failing
    @param encoding     Passed on to os.fdopen
    Raises NotImplementedError for any other mode
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        fd = os.open(filename, flags, 0o666)
        try:
            self.f = os.fdopen(fd, mode, encoding=encoding)
        except BaseException:
            # The descriptor is only owned by the file object once os.fdopen()
            # succeeds; close it here so that it does not leak. It may already
            # have been closed if the failure happened late, hence the suppress
            with contextlib.suppress(OSError):
                os.close(fd)
            raise

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncation is deferred until the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only called for attributes not found on the wrapper itself
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2133
2134
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (cached)"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
2139
2140
def shell_quote(args):
    """Quote every argument for the shell and join them with spaces"""
    encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename', hence the bytes handling
    return ' '.join(
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
2150
2151
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    data is JSON-encoded into the URL fragment. If url already carries
    smuggled data, both are merged, with the data already in the URL taking
    precedence. The data argument itself is left unmodified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict so that the caller's dict is not mutated
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """
    Split a URL produced by smuggle_url into (url, data)

    Returns (smug_url, default) if the URL carries no smuggled data
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
2169
2170
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param fmt      %-format taking the scaled number and the suffix
    @param factor   Ratio between consecutive suffixes; with 1024 the
                    binary suffixes (Ki, Mi, ...) are used
    Returns None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes. The lower bound matters for values
        # below 1/factor: their negative exponent would otherwise index the
        # suffix list from the end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2183
2184
def format_bytes(bytes):
    """Format a byte count with binary suffixes (e.g. KiB), or 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2187
2188
def lookup_unit_table(unit_table, s):
    """
    Parse a "<number><unit>" prefix of s and scale the number by the unit

    @param unit_table   Mapping of unit name to multiplier
    Returns an int, or None if s does not start with a number followed by
    one of the units. Both "," and "." are accepted as the decimal mark.
    """
    units_re = '|'.join(map(re.escape, unit_table))
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
2198
2199
def parse_filesize(s):
    """
    Parse a file size such as "5 MiB" or "1.2GB" into a number of bytes

    Returns None if s is None; otherwise the result of looking s up in the
    unit table below with lookup_unit_table
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # Per prefix: "XiB", "xB" and the "...bibytes" names are powers of 1024;
    # "XB", "Xb", "xb" and the "...bytes" names are powers of 1000
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2269
2270
def parse_count(s):
    """
    Parse a count such as "1,234", "1.5M" or "12k views" into an int

    Returns None if s is None or no number is found at the start
    """
    if s is None:
        return None

    # Drop a leading non-numeric word
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000, 'K': 1000,
        'm': 1000 ** 2, 'M': 1000 ** 2,
        'kk': 1000 ** 2, 'KK': 1000 ** 2,
        'b': 1000 ** 3, 'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # A plain number followed by other text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
2298
2299
def parse_resolution(s, *, lenient=False):
    """
    Extract a resolution from a string

    Recognizes "<width>x<height>" (returns width and height), "<height>p" /
    "<height>i" and "4k" / "8k" (return height only).

    @param lenient  Also accept "<width>x<height>" when it is directly
                    adjacent to letters or digits
    Returns a dict, which is empty if nothing is recognized or s is None
    """
    if s is None:
        return {}

    if lenient:
        dimensions_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        dimensions_re = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    mobj = re.search(dimensions_re, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
2323
2324
def parse_bitrate(s):
    """Return the integer in front of "kbps" in s, or None if there is none or s is not a str"""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2331
2332
def month_by_name(name, lang='en'):
    """
    Return the number of a month by its (locale-independent) name

    @param lang     Key into MONTH_NAMES; unknown languages fall back to 'en'
    Returns None if name is not a month name in that language
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    return next(
        (number for number, month in enumerate(month_names, 1) if month == name),
        None)
2342
2343
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of its name;
        None if abbrev matches no month """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
2352
2353
def fix_xml_ampersands(xml_str):
    """
    Replace every '&' that does not start an entity by '&amp;' in XML

    The predefined entities (amp, lt, gt, apos, quot) and numeric character
    references are left untouched. Numeric references may have up to 6 hex
    or 7 decimal digits, enough for any code point up to U+10FFFF.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
2360
2361
def setproctitle(title):
    """
    Set the name of the current process via prctl() of libc.so.6

    Does nothing if that libc cannot be loaded or has no prctl()
    """
    assert isinstance(title, str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so the string handed to prctl() is always NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2386
2387
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is if it is None or lacks the prefix"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2390
2391
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is if it is None or lacks the suffix"""
    # An empty suffix must be special-cased: s[:-0] would be the empty string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2394
2395
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2403
2404
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of url without a leading "www.", or None
    if it is empty
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2411
2412
def url_basename(url):
    """Return the last component of the path of url ('' if the path is empty)"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2416
2417
def base_url(url):
    """
    Return the http(s) URL up to and including the last "/" of its path

    Raises AttributeError if url does not match
    """
    match = re.match(r'https?://[^?#&]+/', url)
    return match.group()
2420
2421
def urljoin(base, path):
    """
    Join a base URL and a possibly relative path

    bytes arguments are decoded first. A path that already is an absolute
    or protocol-relative URL is returned as is. Returns None if path is
    empty or not a string, or if base is not an http(s) or protocol-relative
    URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if not (isinstance(base, str) and re.match(r'^(?:https?:)?//', base)):
        return None
    return urllib.parse.urljoin(base, path)
2435
2436
class HEADRequest(urllib.request.Request):
    """Request whose method is always HEAD"""

    def get_method(self):
        method = 'HEAD'
        return method
2440
2441
class PUTRequest(urllib.request.Request):
    """Request whose method is always PUT"""

    def get_method(self):
        method = 'PUT'
        return method
2445
2446
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to an int, returning default if that is not possible

    @param scale        The result is floor-divided by this
    @param invscale     The result is multiplied by this (before scale is applied)
    @param get_attr     If given, this attribute of v is converted instead
    """
    value = v
    if v is not None and get_attr:
        value = getattr(v, get_attr, None)
    try:
        result = int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return result
2454
2455
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
2458
2459
def str_to_int(int_str):
    """
    A more relaxed version of int_or_none

    ints are returned as is; in strings, ',', '.' and '+' are removed before
    the conversion. Anything else gives None.
    """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
    return None
2467
2468
def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert v to a float, returning default if v is None or not convertible

    The result is multiplied by invscale and divided by scale
    """
    if v is None:
        return default
    try:
        number = float(v)
    except (ValueError, TypeError):
        return default
    return number * invscale / scale
2476
2477
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default"""
    if isinstance(v, bool):
        return v
    return default
2480
2481
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, else default"""
    if isinstance(v, str):
        return v.strip()
    return default
2484
2485
def url_or_none(url):
    """
    Return the stripped url if it looks like a URL of a supported scheme
    (http(s), rtmp/rtsp variants, mms, ftp(s)) or is protocol-relative;
    None otherwise
    """
    if not (url and isinstance(url, str)):
        return None
    stripped = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', stripped):
        return stripped
    return None
2491
2492
def request_to_url(req):
    """Return the full URL of a urllib Request; anything else is returned as is"""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2498
2499
def strftime_or_none(timestamp, date_format, default=None):
    """
    Format a timestamp with date_format, returning default on any failure

    @param timestamp    UNIX timestamp (int/float, interpreted as UTC) or a
                        date string in YYYYMMDD format
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object is still None => AttributeError
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # OverflowError/OSError: timestamp outside of the range supported by the platform
        return default
2510
2511
def parse_duration(s):
    """Parse a textual duration and return it as a number of seconds.

    Three notations are tried in order:
      1. Clock style: [[[DD:]HH:]MM:]SS[.ms] (":" is also accepted before ms)
      2. Unit style, including ISO 8601-like strings: e.g. "3h11m53s",
         "PT1H0.040S", "3 hours 11 minutes 53 seconds".
         Years, months and weeks are matched but do not contribute
      3. A single (possibly fractional) amount of hours or minutes:
         e.g. "2.5 hours", "87 Min."
    A trailing "Z" is tolerated in all of them.

    @returns    The duration in seconds as a float, or None if `s` is not
                a string, is blank, or is not in a recognized notation
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            # The only capturing groups are days, hours, mins, secs, ms (in this order)
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        # "SS:ms" notation; turn the separator into a decimal point
        ms = ms.replace(':', '.')
    try:
        return sum(float(part or 0) * mult for part, mult in (
            (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
    except ValueError:
        # The hours/minutes notation accepts any mix of digits and dots,
        # so it can capture something that is not a number (e.g. "1.2.3 hours")
        return None
2566
2567
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the existing extension of `filename`.

    If `expected_real_ext` is given and differs from the actual extension,
    `ext` is appended to the unmodified filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2574
2575
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`.

    If `expected_real_ext` is given and differs from the actual extension,
    `ext` is appended to the unmodified filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2581
2582
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched, and returns its name (False otherwise).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        # The process could not be started
        return False
    else:
        return exe
2591
2592
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run `exe` with `args` and return its output (stderr merged into stdout).

    Returns False if the process could not be started. If `to_screen` is given,
    it is called with a message showing the command before it is run.
    """
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run(
            [encodeArgument(exe)] + args,
            text=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
    except OSError:
        return False
    else:
        return stdout
2605
2606
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    `version_re` must capture the version in its first group; by default
    whatever follows the word "version" is used. `unrecognized` is returned
    when the pattern is not found in `output`.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2616
2617
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or printed nothing) """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2624
2625
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the arguments may be floats. With a single argument,
    that argument is the stop value and counting starts at 0.
    A step of 0 yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        sign = 0
    elif step > 0:
        sign = 1
    else:
        sign = -1
    current = start
    # Multiplying by the sign lets one comparison serve both directions
    while sign * current < sign * stop:
        yield current
        current += step
2634
2635
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Items are pulled from the iterable only when needed and are cached.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            # Replay what was already consumed, then continue pulling lazily
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        items = self._exhaust()
        return items[::-1] if self._reversed else items[::1]

    @staticmethod
    def _reverse_index(x):
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Pull only as many items as are needed to reach the highest index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only the first item (in iteration order) has to be evaluated
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2723
2724
class PagedList:
    """Base class for lists whose items are fetched one page at a time.

    `pagefunc(pagenum)` must return an iterable with the items of that page.
    Subclasses have to implement `_getslice`.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, fetching it if it is not cached"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2763
2764
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            page_begin = pagenum * self._pagesize
            page_end = page_begin + self._pagesize
            if start >= page_end:
                continue

            # Offsets of the requested range within the current page
            offset = 0
            if page_begin <= start < page_end:
                offset = start % self._pagesize
            cutoff = None
            if end is not None and page_begin <= end <= page_end:
                cutoff = ((end - 1) % self._pagesize) + 1

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember where fetching failed before propagating the error
                self._pagecount = pagenum - 1
                raise
            if offset != 0 or cutoff is not None:
                page_results = page_results[offset:cutoff]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + offset < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_end:
                break
2804
2805
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, use_cache=True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Number of items to drop from the beginning of the first page
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, last_page):
            page_results = self.getpage(pagenum)
            if to_skip:
                page_results = page_results[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    # This page completes the requested range
                    yield from page_results[:remaining]
                    break
                remaining -= len(page_results)
            yield from page_results
2830
2831
class PlaylistEntries:
    """Access to the entries of a playlist info dict using 1-based playlist indices.

    Indexing with an int or a slice gives a generator of (playlist_index, entry) tuples.
    """

    # Placeholder for the positions that were not among the "requested_entries"
    MissingEntry = object()
    # Whether the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl          Object providing `params`, `report_warning` and `_match_entry`
                            (presumably the YoutubeDL instance - confirm against the callers)
        @param info_dict    Playlist info dict. Its "entries" and "requested_entries" are read
        @raises EntryNotInPlaylist  if the info dict has no "entries"
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # "entries" holds only the requested items. Place each of them at its
            # (1-based) requested index and leave MissingEntry everywhere else
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed
            self._entries = LazyList(entries)

    # A single item specification: "START", or a range "[START]:[END][:STEP]"
    # ("-" is also accepted in place of the first ":"). END may be "inf"/"infinite"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma-separated items specification.

        Yields an int for each single index and a slice for each range.
        @raises ValueError  on an empty segment, a malformed segment or a zero step
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end is parsed as float so that "inf" can be represented
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the params
        "playlist_items", or "playliststart"/"playlistend" when the former is not given.

        Iteration ends early when matching an entry raises
        ExistingVideoReached or RejectedVideoReached
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it can be determined from here,
        otherwise None (implicitly)"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the number of entries
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function returning the entry at a given 0-based index.

        The returned function raises self.IndexError when the index is past the end
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                # In an incomplete list, an index past the end is reported
                # in the same way as a position that was not requested
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped with the _handle_extraction_exceptions decorator
                    # of the ydl class (what it does with errors is defined there)
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate (playlist_index, entry) for the given 1-based index or slice.

        Unlike list slices, the stop of a slice is inclusive.
        Negative start/stop count from the end, which requires evaluating len(self)
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one step further in the direction of iteration
        # since frange excludes it
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards, nothing can follow. Going backwards,
                # lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over all the entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2964
2965
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
2972
2973
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
2980
2981
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are left as they are (in addition to the ones quote() never escapes)
    keep = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=keep)
2985
2986
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host is IDNA-encoded; the remaining components are percent-escaped
    netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = {
        field: escape_rfc3986(getattr(parts, field))
        for field in ('path', 'params', 'query', 'fragment')
    }
    return parts._replace(netloc=netloc, **escaped).geturl()
2997
2998
def parse_qs(url):
    """Parse the query string of `url` into a dict that maps each name to a list of values"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
3001
3002
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Every line yields one URL. Blank lines and lines starting with "#", ";" or "]"
    are skipped, and so is anything after a "#" that follows a whitespace.
    """
    def _clean(line):
        text = line if isinstance(line, str) else line.decode('utf-8', 'replace')
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if text.startswith(bom):
                text = text[len(bom):]
        text = text.lstrip()
        if not text or text.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', text, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in (_clean(line) for line in fd) if url]
3020
3021
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urllib.parse.urlencode, but return ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
3024
3025
def update_url_query(url, query):
    """Return `url` with the parameters of the dict `query` set in its query string.

    Parameters already in the URL are kept unless `query` overrides them.
    """
    if not query:
        return url
    parts = urllib.parse.urlparse(url)
    merged = urllib.parse.parse_qs(parts.query)
    merged.update(query)
    new_query = urllib.parse.urlencode(merged, doseq=True)
    return urllib.parse.urlunparse(parts._replace(query=new_query))
3034
3035
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Create a copy of the request `req` with some of its parts changed.

    `headers` are merged into the existing ones and `query` into the URL;
    `url` and `data` replace the existing values when given.
    The HTTP method is preserved for HEAD and PUT requests.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), urllib.request.Request)

    new_req = request_class(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data,
        headers=merged_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3054
3055
def _multipart_encode_impl(data, boundary):
    """Encode the dict `data` as multipart/form-data using the given boundary.

    Returns a tuple of the encoded body (bytes) and the Content-Type value.
    Raises ValueError if the boundary occurs in any of the fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    delimiter = boundary.encode('ascii')

    chunks = []
    for k, v in data.items():
        name = k.encode() if isinstance(k, str) else k
        value = v.encode() if isinstance(v, str) else v
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if delimiter in content:
            raise ValueError('Boundary overlaps with data')
        chunks.extend((b'--', delimiter, b'\r\n', content))

    chunks.extend((b'--', delimiter, b'--\r\n'))

    return b''.join(chunks), content_type
3076
3077
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded data and the corresponding Content-Type.
    Raises ValueError if the specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # With a caller-supplied boundary, an overlap with the data is an error
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashes with the data; pick another one
            continue
3106
3107
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the value of the first of the given keys that has a usable value in `d`.

    None is never usable; other falsy values are usable only if
    `skip_false_values` is false. `default` is returned if no key qualifies.
    """
    for key in variadic(key_or_keys):
        val = d.get(key)
        if val is None:
            continue
        if val or not skip_false_values:
            return val
    return default
3113
3114
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of `funcs` with `args`/`kwargs` until one of them succeeds.

    A call succeeds if it does not raise AttributeError, KeyError, TypeError,
    IndexError or ZeroDivisionError, and its result is an instance of
    `expected_type` (when given). Returns that result, or None if no call succeeds.
    """
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(val, expected_type):
            return val
    return None
3124
3125
def try_get(src, getter, expected_type=None):
    """Apply `getter` (one callable or several) to `src` via try_call
    and return the first acceptable result, or None"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3128
3129
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which `cndn(key, value)` is true.

    By default, the items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3132
3133
def merge_dicts(*dicts):
    """Merge the given dicts, with earlier dicts taking precedence.

    None values are never taken over. An empty string that is already in
    the result is replaced by a string value from a later dict.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if k not in merged:
                if v is not None:
                    merged[k] = v
            elif isinstance(v, str) and merged[k] == '':
                merged[k] = v
    return merged
3142
3143
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a str, decoding it with `encoding` if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3146
3147
# Age limit for each US rating label.
# parse_age_limit() upper-cases its input before looking it up here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3155
3156
# Age limit for each TV Parental Guidelines label.
# parse_age_limit() matches the part after "TV-" and also accepts the labels
# written with "_" or nothing in place of the "-"
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3165
3166
def parse_age_limit(s):
    """Parse an age limit and return it as an int, or None if it is not recognized.

    Accepted are ints between 0 and 21, strings like "18" or "18+",
    and the labels in US_RATINGS and TV_PARENTAL_GUIDELINES (case-insensitively).
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_rating = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3183
3184
def strip_jsonp(code):
    """Remove a JSONP wrapper such as `callback(...);` and return what is inside of it.

    A code that does not look like JSONP is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
3193
3194
def js_to_json(code, vars={}):
    """Convert a JavaScript object/array literal into JSON text.

    Among others, this quotes bare keys, converts single-quoted strings, hex and
    octal integers, removes comments and trailing commas, and maps undefined to null.

    @param vars     A dict of var, val pairs to substitute. The values are
                    inserted into the output as they are
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )
    # Rewrites needed inside of a string; any other escape sequence is kept
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token in ('undefined', 'void 0'):
            return 'null'
        # Comments, "!" operators and trailing commas are dropped
        if token == ',' or token.startswith(('/*', '//', '!')):
            return ""

        if token[0] in ("'", '"'):
            unquoted = re.sub(
                r'(?s)\\.|"',
                lambda esc: STRING_ESCAPES.get(esc.group(0), esc.group(0)),
                token[1:-1])
            return '"%s"' % unquoted

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                i = int(im.group(1), base)
                # An integer followed by ":" is an object key, which must be a string
                return '"%d":' % i if token.endswith(':') else '%d' % i

        if token in vars:
            return vars[token]

        return '"%s"' % token

    def create_map(mobj):
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3247
3248
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function that maps a quality id to its position in `quality_ids`,
    or to -1 if it is not one of them """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
3257
3258
# Names of the stages at which a postprocessor can be run
# NOTE(review): meaning inferred from the name; the consumers are not in this part of the file
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3260
3261
# Default output templates, keyed by template type.
# The fields are written in %-style mapping syntax, e.g. "%(title)s"
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# The other output template types. The "pl_" prefixed ones mirror a type without the prefix
# NOTE(review): the values look like the filename suffix used for that type
# (None when there is none) - confirm against the code that reads this
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3279
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template of a verbose regex matching one such format specifier.
# It has two positional placeholders that must be filled in before use:
#   {0} - regex for the mapping key (the part within the parentheses)
#   {1} - regex for the conversion type
# A "%" preceded by an odd number of "%" is escaped and hence does not match;
# an even number of them is captured as "prefix"
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''
3295
3296
# The conversion type characters of printf-style (%) string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3298
3299
def limit_length(s, length):
    """ Add ellipses to overly long strings

    A string longer than `length` is cut so that it is `length` characters long
    including the ellipses. None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
3308
3309
def version_tuple(v):
    """Split a version string at every "." and "-" and return the parts as a tuple of ints"""
    return tuple(map(int, re.split(r'[-.]', v)))
3312
3313
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    If `version` is empty or either version cannot be parsed, the version
    is considered up to date when `assume_new` is true and outdated otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
3321
3322
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at the top of the file
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3329
3330
def args_to_str(args):
    """Get a short string representation for a subprocess command:
    the arguments shell-quoted and joined with spaces"""
    return ' '.join(map(compat_shlex_quote, args))
3334
3335
def error_to_compat_str(err):
    """Return the message of the exception `err` as a str"""
    message = str(err)
    return message
3338
3339
def error_to_str(err):
    """Return the exception `err` formatted as "<class name>: <message>" """
    return '{}: {}'.format(type(err).__name__, err)
3342
3343
def mimetype2ext(mt):
    """Return the file extension (without the dot) corresponding to a MIME type.

    The parameters of the MIME type (e.g. "; charset=utf-8") are ignored and
    the type is matched case-insensitively. Lookup is done in this order:
      1. The full type, for when the subtype alone is ambiguous (audio/mp4 => m4a)
      2. The subtype (video/x-flv => flv)
      3. The structured syntax suffix of the subtype (application/foo+json => json)
    If none of these match, the subtype itself is returned, with "+" replaced by ".".

    @param mt   MIME type as a string, or None
    @returns    The extension in lowercase, or None if `mt` is None
    """
    if mt is None:
        return None

    # MIME types are case-insensitive, so normalize once for all of the lookups
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3406
3407
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a filename/URL

    A value without any "." is treated as an extension.
    @returns  The guessed MIME type, or None if there is no input or no guess
    """
    if not ext_or_url:
        return None
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(name)
    return mime_type
3414
3415
def parse_codecs(codecs_str):
    """Sort the entries of a comma separated codecs string into video/audio/subtitle

    The first recognized video codec, audio codec and subtitle codec are kept.
    Codecs that are not recognized are reported with a warning; if nothing at
    all is recognized but there are exactly two entries, they are taken to be
    the video and the audio codec, in that order.

    @returns  {} for empty/unusable input, else a dict with "vcodec" and
              "acodec" ("none" when absent), plus "dynamic_range" and
              (only if present) "scodec" when at least one codec was recognized
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly in front of a digit are dropped before matching
        # (e.g. "vp09.02" is looked at as "vp9.2")
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif (family == 'av1' and traverse_obj(parts, 3) == '10') or parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif family in SUBTITLE_CODECS:
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
3456
3457
def urlhandle_detect_ext(url_handle):
    """Guess the file extension from the headers of a response

    The filename of an attachment-type Content-Disposition header is
    preferred; otherwise the Content-Type header is used.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    mobj = disposition and re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
    if mobj:
        ext = determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3470
3471
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI that carries data with the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3474
3475
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set, or when the content
    carries no limit (i.e. is available for everyone).
    """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3484
3485
# List of known byte-order-marks (BOM), as (mark, encoding) pairs
# NB: The UTF-32 marks must stay in front of the UTF-16 ones when the list is
# scanned in order: the UTF-32-LE mark starts with the bytes of the UTF-16-LE mark
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3494
3495
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks are stripped and select the encoding used for
    decoding (UTF-8 otherwise). The result is the match object (or None) of
    looking for a "<" after optional leading whitespace.
    """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3505
3506
def determine_protocol(info_dict):
    """Work out the download protocol of a format

    An explicit "protocol" entry wins. Otherwise the protocol is derived from
    the (sanitized) "url": from its rtmp/mms/rtsp prefix, then from an
    m3u8/f4m extension, and finally from the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return urllib.parse.urlparse(url).scheme
3527
3528
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  The cells of the first line
    @param data        The rows below the header
    @param delim       If set, a line made of this string is put below the header
    @param extra_gap   Additional spaces between columns
    @param hide_empty  Drop the columns in which every data cell is empty
    @returns           The table as a single string, lines separated by newlines
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    def pad(text, col_width):
        slack = col_width - width(text)
        if '\t' in text:
            # The tab soaks up the slack, pushing the text after it to the right
            return text.replace('\t', ' ' * slack) + ' ' * extra_gap
        return text + ' ' * (slack + extra_gap)

    # The widths of the data columns double as the mask: a width of 0 is falsy
    mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, mask)
    data = [keep_columns(row, mask) for row in data]

    max_lens = column_widths([header_row] + data)
    extra_gap += 1

    table = [header_row]
    if delim:
        separator = [delim * (col_width + extra_gap) for col_width in max_lens]
        separator[-1] = separator[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table.append(separator)
    table += data

    lines = [
        ''.join(pad(str(cell), max_lens[pos]) for pos, cell in enumerate(row)).rstrip()
        for row in table]
    return '\n'.join(lines)
3559
3560
3561 def _match_one(filter_part, dct, incomplete):
3562     # TODO: Generalize code with YoutubeDL._build_format_filter
3563     STRING_OPERATORS = {
3564         '*=': operator.contains,
3565         '^=': lambda attr, value: attr.startswith(value),
3566         '$=': lambda attr, value: attr.endswith(value),
3567         '~=': lambda attr, value: re.search(value, attr),
3568     }
3569     COMPARISON_OPERATORS = {
3570         **STRING_OPERATORS,
3571         '<=': operator.le,  # "<=" must be defined above "<"
3572         '<': operator.lt,
3573         '>=': operator.ge,
3574         '>': operator.gt,
3575         '=': operator.eq,
3576     }
3577
3578     if isinstance(incomplete, bool):
3579         is_incomplete = lambda _: incomplete
3580     else:
3581         is_incomplete = lambda k: k in incomplete
3582
3583     operator_rex = re.compile(r'''(?x)
3584         (?P<key>[a-z_]+)
3585         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3586         (?:
3587             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3588             (?P<strval>.+?)
3589         )
3590         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3591     m = operator_rex.fullmatch(filter_part.strip())
3592     if m:
3593         m = m.groupdict()
3594         unnegated_op = COMPARISON_OPERATORS[m['op']]
3595         if m['negation']:
3596             op = lambda attr, value: not unnegated_op(attr, value)
3597         else:
3598             op = unnegated_op
3599         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3600         if m['quote']:
3601             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3602         actual_value = dct.get(m['key'])
3603         numeric_comparison = None
3604         if isinstance(actual_value, (int, float)):
3605             # If the original field is a string and matching comparisonvalue is
3606             # a number we should respect the origin of the original field
3607             # and process comparison value as a string (see
3608             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3609             try:
3610                 numeric_comparison = int(comparison_value)
3611             except ValueError:
3612                 numeric_comparison = parse_filesize(comparison_value)
3613                 if numeric_comparison is None:
3614                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3615                 if numeric_comparison is None:
3616                     numeric_comparison = parse_duration(comparison_value)
3617         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3618             raise ValueError('Operator %s only supports string values!' % m['op'])
3619         if actual_value is None:
3620             return is_incomplete(m['key']) or m['none_inclusive']
3621         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3622
3623     UNARY_OPERATORS = {
3624         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3625         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3626     }
3627     operator_rex = re.compile(r'''(?x)
3628         (?P<op>%s)\s*(?P<key>[a-z_]+)
3629         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3630     m = operator_rex.fullmatch(filter_part.strip())
3631     if m:
3632         op = UNARY_OPERATORS[m.group('op')]
3633         actual_value = dct.get(m.group('key'))
3634         if is_incomplete(m.group('key')) and actual_value is None:
3635             return True
3636         return op(actual_value)
3637
3638     raise ValueError('Invalid filter part %r' % filter_part)
3639
3640
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The conditions are separated by "&" and must all pass; a literal "&"
    inside a condition is written as "\\&".
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3651
3652
def match_filter_func(filters):
    """Build a function that checks an info dict against filter strings

    @param filters  One filter string or several of them; an entry passes if
                    it passes any one of them. The special filter "-" switches
                    on the interactive mode
    @returns        None if there are no filters. Otherwise a function
                    (info_dict, incomplete=False) which returns a "skipping"
                    message if the entry is rejected, else None (or NO_DEFAULT
                    in interactive mode, unless incomplete is set)
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        passed = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if passed:
            return NO_DEFAULT if interactive and not incomplete else None

        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3670
3671
class download_range_func:
    """Callable that yields the sections of a video which are to be downloaded

    @param chapters  Regexes that are searched for in the chapter titles
    @param ranges    (start_time, end_time) pairs
    Both may be None.
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        """Yield the matching chapters (with their "index" added), then the ranges

        If chapter regexes were given but none of them matched,
        this is reported through ydl.to_screen
        """
        found = False
        for pattern in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(pattern, chapter['title']):
                    found = True
                    yield {**chapter, 'index': index}

        if self.chapters and not found:
            if info_dict.get('chapters'):
                warning = 'There are no chapters matching the regex'
            else:
                warning = 'Cannot match chapters since chapter information is unavailable'
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges
3692
3693
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression to seconds

    Accepted are a plain number with an optional "s" suffix, and the clock
    form "H:MM:SS" whose fraction may follow after either "." or ":".
    @returns  The time in seconds, or None if the expression is empty or not understood
    """
    if not time_expr:
        return

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3705
3706
def srt_subtitles_timecode(seconds):
    """Format a time given in seconds as an SRT timecode (HH:MM:SS,mmm)"""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3709
3710
def ass_subtitles_timecode(seconds):
    """Format a time given in seconds as an ASS timecode (H:MM:SS.cc)"""
    timetuple = timetuple_from_msec(seconds * 1000)
    # ASS has a resolution of hundredths of a second
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3714
3715
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT

    Every <p> element becomes one SRT cue. Paragraphs without a usable begin
    time, or with neither an end time nor a duration, are skipped.
    The styling properties listed in SUPPORTED_STYLING are rendered
    as <font>, <b>, <i> and <u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document does not contain any <p> element
    '''
    # (current namespace, [legacy namespaces that are rewritten to it])
    # NB: The pairs are applied in order. Rewriting the ttaf1 namespaces also
    # turns ".../ttaf1#style" into ".../ns/ttml#style", which the second pair
    # then rewrites to the current styling namespace
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, with inheritance resolved
    default_style = {}  # style of <body>/<div>; applied to every element

    class TTMLPElementParser:
        # Parser target that renders the content of one <p> element
        # NOTE(review): The two lists below are class attributes that are only
        # ever mutated in place, so all the parser instances created during one
        # dfxp2srt() call share them. The emitted markup depends on this state,
        # so confirm the intended behavior before making them per-instance
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Later updates win: default style < referenced style < inline properties
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Do not repeat what the most recently applied style already set
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags in the reverse order of opening
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the node and parse it again, with the renderer as target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve the style inheritance. A style may refer to a parent that is only
    # declared further down, so passes are repeated until nothing is left over
    while True:
        repeat = False
        known_styles = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # Also stop when a pass did not resolve anything new. A parent that is
        # undeclared, part of a cycle, or without any supported property never
        # shows up in styles, and would keep this loop running forever
        if not repeat or len(styles) == known_styles:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # NB: Skipped paragraphs still use up an index
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3878
3879
def cli_option(params, command_option, param, separator=None):
    """Build the command line arguments for an option that takes a value

    @param params          The dictionary to read the value from
    @param command_option  The name of the option on the command line
    @param param           The key of the value in params
    @param separator       If given, option and value are joined by it into
                           a single argument
    @returns               A list of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3885
3886
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for an option that takes a boolean value

    The value of param in params must be True, False or None (unset).
    It is rendered as true_value or false_value and handed to cli_option.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
3891
3892
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if the value of param in params equals expected_value, else []"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3895
3896
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the arguments that are configured for the first matching key

    @param argdict     Dictionary of argument lists. A list/tuple is returned
                       as it is when use_compat is set, and ignored otherwise
    @param keys        Keys in the order of preference (looked up in lower
                       case). An entry may group several keys, the arguments
                       of which are then concatenated
    @param default     Returned when there is no argdict or no key matches
    @param use_compat  Whether a list/tuple argdict is to be accepted
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_list = [args for args in candidates if args is not None]
        if arg_list:
            return list(itertools.chain.from_iterable(arg_list))
    return default
3915
3916
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the arguments configured for exe when it is used by main_key

    The keys (suffixes, default: just the empty suffix) are appended to the
    root key, which is "<main_key>+<exe>" or only "<exe>" if both are equal.
    If the bare root key is among them, the (main_key, exe) group and
    "default" are tried as well. Otherwise use_compat is switched off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        use_compat = False
    else:
        if not same_name:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3928
3929
class ISO639Utils:
    """Conversion between two-letter (ISO 639-1) and three-letter (ISO 639-2/T) language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at.
        Returns None for an unknown code.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several short codes share a long code, the one that comes first
        in the table is returned. Returns None for an unknown code.
        """
        matches = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
4133
4134
4135 class ISO3166Utils:
4136     # From http://data.okfn.org/data/core/country-list
4137     _country_map = {
4138         'AF': 'Afghanistan',
4139         'AX': 'Åland Islands',
4140         'AL': 'Albania',
4141         'DZ': 'Algeria',
4142         'AS': 'American Samoa',
4143         'AD': 'Andorra',
4144         'AO': 'Angola',
4145         'AI': 'Anguilla',
4146         'AQ': 'Antarctica',
4147         'AG': 'Antigua and Barbuda',
4148         'AR': 'Argentina',
4149         'AM': 'Armenia',
4150         'AW': 'Aruba',
4151         'AU': 'Australia',
4152         'AT': 'Austria',
4153         'AZ': 'Azerbaijan',
4154         'BS': 'Bahamas',
4155         'BH': 'Bahrain',
4156         'BD': 'Bangladesh',
4157         'BB': 'Barbados',
4158         'BY': 'Belarus',
4159         'BE': 'Belgium',
4160         'BZ': 'Belize',
4161         'BJ': 'Benin',
4162         'BM': 'Bermuda',
4163         'BT': 'Bhutan',
4164         'BO': 'Bolivia, Plurinational State of',
4165         'BQ': 'Bonaire, Sint Eustatius and Saba',
4166         'BA': 'Bosnia and Herzegovina',
4167         'BW': 'Botswana',
4168         'BV': 'Bouvet Island',
4169         'BR': 'Brazil',
4170         'IO': 'British Indian Ocean Territory',
4171         'BN': 'Brunei Darussalam',
4172         'BG': 'Bulgaria',
4173         'BF': 'Burkina Faso',
4174         'BI': 'Burundi',
4175         'KH': 'Cambodia',
4176         'CM': 'Cameroon',
4177         'CA': 'Canada',
4178         'CV': 'Cape Verde',
4179         'KY': 'Cayman Islands',
4180         'CF': 'Central African Republic',
4181         'TD': 'Chad',
4182         'CL': 'Chile',
4183         'CN': 'China',
4184         'CX': 'Christmas Island',
4185         'CC': 'Cocos (Keeling) Islands',
4186         'CO': 'Colombia',
4187         'KM': 'Comoros',
4188         'CG': 'Congo',
4189         'CD': 'Congo, the Democratic Republic of the',
4190         'CK': 'Cook Islands',
4191         'CR': 'Costa Rica',
4192         'CI': 'Côte d\'Ivoire',
4193         'HR': 'Croatia',
4194         'CU': 'Cuba',
4195         'CW': 'Curaçao',
4196         'CY': 'Cyprus',
4197         'CZ': 'Czech Republic',
4198         'DK': 'Denmark',
4199         'DJ': 'Djibouti',
4200         'DM': 'Dominica',
4201         'DO': 'Dominican Republic',
4202         'EC': 'Ecuador',
4203         'EG': 'Egypt',
4204         'SV': 'El Salvador',
4205         'GQ': 'Equatorial Guinea',
4206         'ER': 'Eritrea',
4207         'EE': 'Estonia',
4208         'ET': 'Ethiopia',
4209         'FK': 'Falkland Islands (Malvinas)',
4210         'FO': 'Faroe Islands',
4211         'FJ': 'Fiji',
4212         'FI': 'Finland',
4213         'FR': 'France',
4214         'GF': 'French Guiana',
4215         'PF': 'French Polynesia',
4216         'TF': 'French Southern Territories',
4217         'GA': 'Gabon',
4218         'GM': 'Gambia',
4219         'GE': 'Georgia',
4220         'DE': 'Germany',
4221         'GH': 'Ghana',
4222         'GI': 'Gibraltar',
4223         'GR': 'Greece',
4224         'GL': 'Greenland',
4225         'GD': 'Grenada',
4226         'GP': 'Guadeloupe',
4227         'GU': 'Guam',
4228         'GT': 'Guatemala',
4229         'GG': 'Guernsey',
4230         'GN': 'Guinea',
4231         'GW': 'Guinea-Bissau',
4232         'GY': 'Guyana',
4233         'HT': 'Haiti',
4234         'HM': 'Heard Island and McDonald Islands',
4235         'VA': 'Holy See (Vatican City State)',
4236         'HN': 'Honduras',
4237         'HK': 'Hong Kong',
4238         'HU': 'Hungary',
4239         'IS': 'Iceland',
4240         'IN': 'India',
4241         'ID': 'Indonesia',
4242         'IR': 'Iran, Islamic Republic of',
4243         'IQ': 'Iraq',
4244         'IE': 'Ireland',
4245         'IM': 'Isle of Man',
4246         'IL': 'Israel',
4247         'IT': 'Italy',
4248         'JM': 'Jamaica',
4249         'JP': 'Japan',
4250         'JE': 'Jersey',
4251         'JO': 'Jordan',
4252         'KZ': 'Kazakhstan',
4253         'KE': 'Kenya',
4254         'KI': 'Kiribati',
4255         'KP': 'Korea, Democratic People\'s Republic of',
4256         'KR': 'Korea, Republic of',
4257         'KW': 'Kuwait',
4258         'KG': 'Kyrgyzstan',
4259         'LA': 'Lao People\'s Democratic Republic',
4260         'LV': 'Latvia',
4261         'LB': 'Lebanon',
4262         'LS': 'Lesotho',
4263         'LR': 'Liberia',
4264         'LY': 'Libya',
4265         'LI': 'Liechtenstein',
4266         'LT': 'Lithuania',
4267         'LU': 'Luxembourg',
4268         'MO': 'Macao',
4269         'MK': 'Macedonia, the Former Yugoslav Republic of',
4270         'MG': 'Madagascar',
4271         'MW': 'Malawi',
4272         'MY': 'Malaysia',
4273         'MV': 'Maldives',
4274         'ML': 'Mali',
4275         'MT': 'Malta',
4276         'MH': 'Marshall Islands',
4277         'MQ': 'Martinique',
4278         'MR': 'Mauritania',
4279         'MU': 'Mauritius',
4280         'YT': 'Mayotte',
4281         'MX': 'Mexico',
4282         'FM': 'Micronesia, Federated States of',
4283         'MD': 'Moldova, Republic of',
4284         'MC': 'Monaco',
4285         'MN': 'Mongolia',
4286         'ME': 'Montenegro',
4287         'MS': 'Montserrat',
4288         'MA': 'Morocco',
4289         'MZ': 'Mozambique',
4290         'MM': 'Myanmar',
4291         'NA': 'Namibia',
4292         'NR': 'Nauru',
4293         'NP': 'Nepal',
4294         'NL': 'Netherlands',
4295         'NC': 'New Caledonia',
4296         'NZ': 'New Zealand',
4297         'NI': 'Nicaragua',
4298         'NE': 'Niger',
4299         'NG': 'Nigeria',
4300         'NU': 'Niue',
4301         'NF': 'Norfolk Island',
4302         'MP': 'Northern Mariana Islands',
4303         'NO': 'Norway',
4304         'OM': 'Oman',
4305         'PK': 'Pakistan',
4306         'PW': 'Palau',
4307         'PS': 'Palestine, State of',
4308         'PA': 'Panama',
4309         'PG': 'Papua New Guinea',
4310         'PY': 'Paraguay',
4311         'PE': 'Peru',
4312         'PH': 'Philippines',
4313         'PN': 'Pitcairn',
4314         'PL': 'Poland',
4315         'PT': 'Portugal',
4316         'PR': 'Puerto Rico',
4317         'QA': 'Qatar',
4318         'RE': 'Réunion',
4319         'RO': 'Romania',
4320         'RU': 'Russian Federation',
4321         'RW': 'Rwanda',
4322         'BL': 'Saint Barthélemy',
4323         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4324         'KN': 'Saint Kitts and Nevis',
4325         'LC': 'Saint Lucia',
4326         'MF': 'Saint Martin (French part)',
4327         'PM': 'Saint Pierre and Miquelon',
4328         'VC': 'Saint Vincent and the Grenadines',
4329         'WS': 'Samoa',
4330         'SM': 'San Marino',
4331         'ST': 'Sao Tome and Principe',
4332         'SA': 'Saudi Arabia',
4333         'SN': 'Senegal',
4334         'RS': 'Serbia',
4335         'SC': 'Seychelles',
4336         'SL': 'Sierra Leone',
4337         'SG': 'Singapore',
4338         'SX': 'Sint Maarten (Dutch part)',
4339         'SK': 'Slovakia',
4340         'SI': 'Slovenia',
4341         'SB': 'Solomon Islands',
4342         'SO': 'Somalia',
4343         'ZA': 'South Africa',
4344         'GS': 'South Georgia and the South Sandwich Islands',
4345         'SS': 'South Sudan',
4346         'ES': 'Spain',
4347         'LK': 'Sri Lanka',
4348         'SD': 'Sudan',
4349         'SR': 'Suriname',
4350         'SJ': 'Svalbard and Jan Mayen',
4351         'SZ': 'Swaziland',
4352         'SE': 'Sweden',
4353         'CH': 'Switzerland',
4354         'SY': 'Syrian Arab Republic',
4355         'TW': 'Taiwan, Province of China',
4356         'TJ': 'Tajikistan',
4357         'TZ': 'Tanzania, United Republic of',
4358         'TH': 'Thailand',
4359         'TL': 'Timor-Leste',
4360         'TG': 'Togo',
4361         'TK': 'Tokelau',
4362         'TO': 'Tonga',
4363         'TT': 'Trinidad and Tobago',
4364         'TN': 'Tunisia',
4365         'TR': 'Turkey',
4366         'TM': 'Turkmenistan',
4367         'TC': 'Turks and Caicos Islands',
4368         'TV': 'Tuvalu',
4369         'UG': 'Uganda',
4370         'UA': 'Ukraine',
4371         'AE': 'United Arab Emirates',
4372         'GB': 'United Kingdom',
4373         'US': 'United States',
4374         'UM': 'United States Minor Outlying Islands',
4375         'UY': 'Uruguay',
4376         'UZ': 'Uzbekistan',
4377         'VU': 'Vanuatu',
4378         'VE': 'Venezuela, Bolivarian Republic of',
4379         'VN': 'Viet Nam',
4380         'VG': 'Virgin Islands, British',
4381         'VI': 'Virgin Islands, U.S.',
4382         'WF': 'Wallis and Futuna',
4383         'EH': 'Western Sahara',
4384         'YE': 'Yemen',
4385         'ZM': 'Zambia',
4386         'ZW': 'Zimbabwe',
4387         # Not ISO 3166 codes, but used for IP blocks
4388         'AP': 'Asia/Pacific Region',
4389         'EU': 'Europe',
4390     }
4391
4392     @classmethod
4393     def short2full(cls, code):
4394         """Convert an ISO 3166-2 country code to the corresponding full name"""
4395         return cls._country_map.get(code.upper())
4396
4397
class GeoUtils:
    """Helpers for producing IPv4 addresses that belong to a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Pick a random IPv4 address from a country's block or a CIDR block

        @param code_or_block    Either a two-character country code (looked up
                                case-insensitively in _country_ip_map) or an
                                'address/prefix-length' string
        @returns                Dotted-quad address string, or None when the
                                country code has no entry in the map
        """
        block = code_or_block
        if len(code_or_block) == 2:
            # Anything exactly two characters long is treated as a country code
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None

        network, prefix_len = block.split('/')
        (lowest,) = struct.unpack('!L', socket.inet_aton(network))
        # Setting every host bit yields the highest address of the range
        host_mask = 0xffffffff >> int(prefix_len)
        chosen = random.randint(lowest, lowest | host_mask)
        return socket.inet_ntoa(struct.pack('!L', chosen))
4656
4657
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request

    A request carrying a 'Ytdl-request-proxy' header uses that value as its
    proxy; the header is removed before the request is processed further.
    """

    def __init__(self, proxies=None):
        def make_opener(scheme):
            # Defaults are bound here so each opener keeps its own scheme
            def opener(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            return opener

        # Default http/https openers, installed before the base initializer runs
        for scheme in ('http', 'https'):
            setattr(self, f'{scheme}_open', make_opener(scheme))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Route `req` through `proxy`, honouring a per-request override

        Returns None when no proxy applies or when a SOCKS proxy was recorded
        on the request; otherwise defers to ProxyHandler.proxy_open.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only record the SOCKS proxy on the request here;
            # yt-dlp's http/https handlers wrap the socket with SOCKS
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return urllib.request.ProxyHandler.proxy_open(self, req, proxy, type)
4681
4682
4683 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4684 # released into Public Domain
4685 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4686
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.

    Zero and negative values both yield a single zero byte (before padding).
    """
    n = int(n)
    if n > 0:
        # Minimal-length big-endian encoding, i.e. without leading zero bytes
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        # Kept from the previous implementation: nothing to encode for n <= 0
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4715
4716
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    An empty byte string converts to 0.
    """
    return int.from_bytes(s, 'big')
4732
4733
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte order is reversed before the data is read as one hex number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return format(pow(payload, exponent, modulus), 'x')
4749
4750
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the padding
    consists of at least 8 non-zero pseudo-random bytes.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not fit into length with padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding bytes must be non-zero: the first zero byte after the [0, 2]
    # header marks the start of the data when the padding is removed
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + list(data)
4764
4765
4766 def _base_n_table(n, table):
4767     if not table and not n:
4768         raise ValueError('Either table or n must be specified')
4769     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4770
4771     if n and n != len(table):
4772         raise ValueError(f'base {n} exceeds table length {len(table)}')
4773     return table
4774
4775
def encode_base_n(num, n=None, table=None):
    """Convert given non-negative int to a base-n string

    @param num      The number to encode
    @param n        The base; the first n characters of the table are used
    @param table    The digit characters; defaults to 0-9, a-z, A-Z
    @raises ValueError  if num is negative or fewer than two digits are
                        available for a non-zero num
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Repeated floor division never reaches 0 for a negative number or for
    # base 1, so fail early instead of looping forever
    if num < 0:
        raise ValueError('Cannot encode a negative number')
    if base < 2:
        raise ValueError('At least two digits are needed to encode a non-zero number')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    # Digits were collected least significant first
    return ''.join(reversed(digits))
4787
4788
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    alphabet = _base_n_table(n, table)
    digit_values = {char: index for index, char in enumerate(alphabet)}
    base = len(digit_values)
    # Fold the digits from most significant to least significant;
    # a character missing from the table raises KeyError
    return functools.reduce(
        lambda value, char: value * base + digit_values[char], string, 0)
4796
4797
def decode_base(value, digits):
    """Deprecated: decode `value` using `digits` as the digit table

    Writes a deprecation notice and forwards to decode_base_n.
    """
    write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
    return decode_base_n(value, table=digits)
4802
4803
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into its original words

    Every word of the obfuscated code is replaced through a table that maps
    the base-n representation of an index to the symbol stored at that index.
    An empty symbol means the word stands for itself.
    """
    obfuscated_code, base, count, symbols = re.search(PACKED_CODES_RE, code).groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    def expand(word):
        return symbol_table[word.group(0)]

    return re.sub(r'\b(\w+)\b', expand, obfuscated_code)
4820
4821
def caesar(s, alphabet, shift):
    """Shift every character of `s` that occurs in `alphabet` by `shift` places

    The shift wraps around the alphabet; characters that are not part of the
    alphabet are kept as they are. A shift of 0 returns `s` itself.
    """
    if shift == 0:
        return s

    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4829
4830
def rot47(s):
    """Apply a caesar shift of 47 over the 94-character alphabet from '!' to '~'"""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4833
4834
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict

    Values may be double-quoted, in which case the surrounding quotes are
    removed. If a key occurs more than once, the last value is kept.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
4842
4843
def urshift(val, n):
    """Shift `val` right by `n` bits, adding 2**32 to negative values first"""
    if val < 0:
        val += 0x100000000
    return val >> n
4846
4847
4848 # Based on png2str() written by @gdkchan and improved by @yokrysty
4849 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG image data into rows of unfiltered byte values

    Every scanline is read as width * 3 bytes; the bit depth and colour type
    stored in the IHDR chunk are not inspected.

    @param png_data     The complete PNG file contents as bytes
    @returns            (width, height, pixels), where pixels is a list with
                        one list of ints per row
    @raises OSError     if the signature or the leading IHDR chunk is missing,
                        or if there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return struct.unpack(formats_by_size[len(raw)], raw)[0]

    # Chunk layout: 4-byte length, 4-byte type, data, 4-byte CRC (skipped)
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunks.append({
            'type': remaining[4:8],
            'length': size,
            'data': remaining[8:8 + size],
        })
        remaining = remaining[8 + size + 4:]

    header_data = chunks[0]['data']
    width = read_uint(header_data[:4])
    height = read_uint(header_data[4:8])

    idat = b''.join(chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT')
    if not idat:
        raise OSError('Unable to read PNG data.')

    raw = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for row_index in range(height):
        # Every scanline is preceded by one byte naming its filter type
        row_start = row_index * (1 + stride)
        filter_type = raw[row_start]
        previous_row = pixels[row_index - 1] if row_index > 0 else None

        row = []
        pixels.append(row)

        for col in range(stride):
            value = raw[row_start + 1 + col]
            # Neighbours outside of the image count as 0
            left = row[col - 3] if col > 2 else 0
            up = previous_row[col] if row_index > 0 else 0

            if filter_type == 1:  # Sub
                value = (value + left) & 0xff
            elif filter_type == 2:  # Up
                value = (value + up) & 0xff
            elif filter_type == 3:  # Average
                value = (value + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[col - 3] if col > 2 and row_index > 0 else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # Use the neighbour closest to the estimate; ties prefer left, then up
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    value = (value + left) & 0xff
                elif dist_up <= dist_upper_left:
                    value = (value + up) & 0xff
                else:
                    value = (value + upper_left) & 0xff

            row.append(value)

    return width, height, pixels
4953
4954
def write_xattr(path, key, value):
    """Store `value` (bytes) as the extended attribute `key` of the file at `path`

    Raises XAttrMetadataError if writing fails and XAttrUnavailableError if
    no way of writing extended attributes is found.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        stream_path = f'{path}:{key}'
        try:
            with open(stream_path, 'wb') as stream:
                stream.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    if getattr(xattr, '_yt_dlp__identifier', None) != 'pyxattr':
        setxattr = xattr.setxattr if xattr else None
    elif version_tuple(xattr.__version__) >= (0, 5, 0):
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        setxattr = xattr.set
    else:
        setxattr = None

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables take the value as a text argument
    value = value.decode()
    if exe == 'xattr':
        cmd = [exe, '-w', key, value, path]
    else:
        cmd = [exe, '-n', key, '-v', value, path]

    try:
        _, stderr, returncode = Popen.run(
            cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
5004
5005
def random_birthday(year_field, month_field, day_field):
    """Pick a random date from 1950-01-01 to 1995-12-31 (both inclusive)

    Returns a dict that maps the three given field names to the year, month
    and day of that date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1).toordinal()
    latest = datetime.date(1995, 12, 31).toordinal()
    birthday = datetime.date.fromordinal(random.randint(earliest, latest))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
5016
5017
# Templates for internet shortcut files, which are plain text files.
# Each template is filled in with the % operator and a dict of values.

# "[InternetShortcut]" format; expects the key `url`
DOT_URL_LINK_TEMPLATE = (
    '[InternetShortcut]\n'
    'URL=%(url)s\n'
)

# Apple property list format; expects the key `url`
DOT_WEBLOC_LINK_TEMPLATE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
    '<plist version="1.0">\n'
    '<dict>\n'
    '\t<key>URL</key>\n'
    '\t<string>%(url)s</string>\n'
    '</dict>\n'
    '</plist>\n'
)

# "[Desktop Entry]" format; expects the keys `filename` and `url`
DOT_DESKTOP_LINK_TEMPLATE = (
    '[Desktop Entry]\n'
    'Encoding=UTF-8\n'
    'Name=%(filename)s\n'
    'Type=Link\n'
    'URL=%(url)s\n'
    'Icon=text-html\n'
)

# Maps a link type name to its template
LINK_TEMPLATES = dict(
    url=DOT_URL_LINK_TEMPLATE,
    desktop=DOT_DESKTOP_LINK_TEMPLATE,
    webloc=DOT_WEBLOC_LINK_TEMPLATE,
)
5049
5050
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for port 80 on `http` URLs, where it is the default and therefore redundant.
    IRIs without a hostname (e.g. relative references) only get their remaining components escaped.

    Raises ValueError for IRIs whose network location contains an IPv6 literal.
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    hostname = iri_parts.hostname
    if hostname is not None:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += hostname.encode('idna').decode()

    port = iri_parts.port
    # Port 80 may only be dropped where it is the default; for any other
    # scheme, dropping it would make the URI point to a different port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += f':{port}'

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5093
5094
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to MAX_PATH on Windows

    On win32 and cygwin the absolute path is prefixed with `\\\\?\\`; on all
    other platforms the path is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
5101
5102
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and format the value with `template`

    `default` is returned instead when the value is to be ignored: without
    `ignore`, any falsy value other than 0; with `ignore`, any value found
    in it. Otherwise `func` is applied to the value before formatting.
    """
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val and val != 0
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
5108
5109
def clean_podcast_url(url):
    """Remove the prefixes of known podcast tracking services from `url`

    Every occurrence of such a prefix (including its trailing slash) is
    removed, so chained prefixes are all stripped.
    """
    tracking_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix, '', url)
5125
5126
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """
    Generate a random version 4 UUID string using the `random` module

    In the template, each `x` becomes a random hex digit and `y` becomes the
    variant digit, which RFC 4122 restricts to 8, 9, a or b.
    The result is NOT suitable for security-sensitive purposes
    """
    def _random_digit(mobj):
        if mobj.group() == 'y':
            # The two most significant bits of the variant nibble must be "10"
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5132
5133
def make_dir(path, to_screen=None):
    """
    Create the parent directory of `path` if it does not exist yet

    @param path       Path of a file whose containing directory is needed
    @param to_screen  Optional callable that receives the error message
                      when the directory cannot be created
    @returns          True on success (or if there was nothing to create),
                      False if an OSError occurred
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok guards against the directory appearing between the check and the call
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually given; to_screen defaults to None
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5144
5145
def get_executable_path():
    """Return the absolute directory containing the running executable, as reported by the updater"""
    # Imported lazily inside the function
    # NOTE(review): presumably to avoid a circular import with `.update` - confirm
    from .update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
5150
5151
def load_plugins(name, suffix, namespace):
    """
    Load plugin classes from `ytdlp_plugins/<name>/__init__.py` next to the executable

    @param name       Plugin package name; also used as the module name
    @param suffix     Only attributes whose name ends with this are collected
    @param namespace  Dict that is updated in-place with the collected objects;
                      names already present in it are left untouched
    @returns          Dict of the newly added name -> object pairs
                      (empty if the plugin file does not exist)
    """
    loaded = {}
    with contextlib.suppress(FileNotFoundError):
        init_file = os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')
        spec = importlib.util.spec_from_file_location(name, init_file)
        module = importlib.util.module_from_spec(spec)
        # Register the module before executing it
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)
        for attr in dir(module):
            if attr in namespace or not attr.endswith(suffix):
                continue
            loaded[attr] = namespace[attr] = getattr(module, attr)
    return loaded
5168
5169
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            The result of the first path that yields a usable value is returned.
                            Each path is a list of keys where each key is a:
                              - None:     Stop traversing and use the current object as the result
                              - string:   A dictionary key
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        # Lowercase all string keys of every path up-front; dict keys are lowercased on lookup
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records the deepest level of list-nesting created by branching keys
        # (Ellipsis, functions, tuples/lists) so that the caller knows how often to flatten
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # A None key, or a None object, ends the traversal with the current object
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse every alternative key, then branch over the collected results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                # The rest of the path is applied to each branch separately
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Build (key, value) pairs for the filter function
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                # NOTE(review): try_call presumably yields a falsy value if the filter raises - confirm
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Exact lookup when possible, otherwise compare against the lowercased dict keys
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User supplied keys are strings: convert "N" to an int and "A:B:C" to a slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    # A bare ":" selects everything, i.e. behaves like Ellipsis
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalize expected_type into a function returning the accepted value or None
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching produced nested lists; flatten all but the outermost level
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5266
5267
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj; writes a deprecation notice on every call"""
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5272
5273
def get_first(obj, keys, **kwargs):
    """
    Return the first value found for `keys` among all the values of `obj`

    Equivalent to traverse_obj with a leading Ellipsis in the path and get_all=False;
    any other keyword arguments are passed through to traverse_obj
    """
    branching_path = (..., *variadic(keys))
    return traverse_obj(obj, branching_path, **kwargs, get_all=False)
5276
5277
def variadic(x, allowed_types=(str, bytes, dict)):
    """
    Return `x` itself if it is an iterable, else wrap it in a 1-tuple

    Iterables that are instances of `allowed_types` are treated
    as single values and are therefore wrapped as well
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, allowed_types):
        return (x,)
    return x
5280
5281
def time_seconds(**kwargs):
    """
    Return the POSIX timestamp of the current time

    @param kwargs  Passed to datetime.timedelta to build the UTC offset of the
                   timezone in which the current time is taken
    """
    utc_offset = datetime.timedelta(**kwargs)
    now = datetime.datetime.now(datetime.timezone(utc_offset))
    return now.timestamp()
5285
5286
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """
    Build a token of the form `header.payload.signature` signed with HMAC-SHA256

    @param payload_data  JSON-serializable payload
    @param key           Signing secret, either str or bytes
    @param headers       Optional dict of extra header fields; these override
                         the default "alg" and "typ" fields
    @returns             The token as bytes
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    secret = key.encode() if isinstance(key, str) else key
    # NOTE(review): the segments use padded standard base64, whereas RFC 7515 specifies
    # unpadded base64url. Kept as-is since existing callers depend on this exact output
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    signing_input = header_b64 + b'.' + payload_b64
    signature = hmac.new(secret, signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(signature)
5304
5305
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Decode the payload of a JWT given as a `header.payload.signature` string

    The signature is NOT verified and the header is ignored.
    Raises ValueError if the token does not consist of exactly three segments
    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWS segments are base64url with the trailing "=" stripped, but the decoder
    # rejects input whose length is not a multiple of 4, so restore the padding
    padded_payload = payload_b64 + '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(padded_payload))
5311
5312
# Tri-state flag: None when not running on Windows, False on Windows
# until windows_enable_vt_mode() succeeds and sets it to True
WINDOWS_VT_MODE = None if compat_os_name != 'nt' else False
5314
5315
@functools.cache
def supports_terminal_sequences(stream):
    """
    Whether terminal escape sequences may be written to `stream`

    Requires WINDOWS_VT_MODE to be enabled on Windows, or the TERM environment
    variable to be set elsewhere, and `stream` to be a tty.
    The result is cached per stream
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        # Any failure to query the stream is treated as "not a terminal"
        return False
    return is_tty
5327
5328
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """
    Try to enable terminal sequence support on Windows

    Does nothing on Windows versions older than 10.0.10586 or if the helper command fails.
    On success, WINDOWS_VT_MODE is set and the cache of supports_terminal_sequences is reset
    """
    global WINDOWS_VT_MODE

    if get_windows_version() < (10, 0, 10586):
        return

    try:
        # NOTE(review): running an empty shell command presumably switches the console to VT mode - confirm
        Popen.run('', shell=True)
    except Exception:
        return
    else:
        WINDOWS_VT_MODE = True
        supports_terminal_sequences.cache_clear()
5340
5341
# Matches ESC "[" followed by one or more non-"m" characters and a closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with all terminal sequences matched by _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
5347
5348
def number_of_digits(number):
    """Return the length of `number` when formatted as an integer (a minus sign is counted too)"""
    rendered = '%d' % number
    return len(rendered)
5351
5352
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the string form of all truthy `values` using `delim`

    @param from_dict  If given, each value is instead treated as a path
                      that is looked up in this object using traverse_obj
    """
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
5357
5358
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    If no format has a width, `thumbnails` is returned as-is.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Tuples compare element-wise, so the widest format wins and height breaks ties
    largest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        rewritten_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': rewritten_url}, dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
5379
5380
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns  (start, end, total); all three are None if the value is
              empty or does not contain a bytes range
    """
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if mobj is None:
        return None, None, None
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5389
5390
def read_stdin(what):
    """Tell the user that `what` is being read from STDIN (and how to end the input) and return sys.stdin"""
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5395
5396
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  The first bytes of the file
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    bom_match = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    # re.match anchors at the very start, so the declaration must be the first thing in the data
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
5413
5414
class Config:
    """
    A list of command-line style arguments plus the configs that it references

    Every instance holds its own arguments (`own_args`) and a list of child
    instances (`configs`), one for each config location found in those arguments
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser  Object providing parse_known_args, parse_args and error
        @param label   Optional name used when printing the config
        """
        self.parser, self.label = parser, label
        # _loaded_paths is shared with all children (see append_config) to prevent loading a file twice
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments (and the file they came from) and load the configs they reference.
        May only be called once per instance. Returns the result of load_configs"""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse the own arguments and load every config location given in them

        @returns  False if the file of this config was already loaded, True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of the current config file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # Own arguments (with login info hidden) come first, followed by the
        # child configs with each of their lines prefixed by "| "
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read a config file and split it into a list of arguments

        @param filename  Path of the file
        @param default   Returned as-is (the very same object) when the file cannot
                         be opened, so callers must not mutate the shared default list
        @raises ValueError  If the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        # The context manager guarantees that the file is closed on every exit path
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                res = shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of the argument list `opts` with the values of all login related options replaced by "PRIVATE" """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the "--option value" form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (passed to init) and keep it unless its file was already loaded"""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Iterate over the arguments of all child configs (most recently added first), followed by the own arguments"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Parse all_args using parse_known_args of the parser"""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Parse all_args using parse_args of the parser"""
        return self.parser.parse_args(self.all_args)
5519
5520
class WebSocketsWrapper():
    """
    Wraps websockets module to use in non-async scopes

    Every instance owns a private event loop on which all the coroutines of the
    connection are run to completion. Can be used as a context manager
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL to connect to
        @param headers  Extra headers passed to websockets.connect
        @param connect  Whether to open the connection immediately
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Make sure that the connection gets closed when the interpreter exits
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Send a message; `args` are passed to the send method of the connection"""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Receive and return a message; `args` are passed to the recv method of the connection"""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # __exit__ is always registered with atexit, so it runs a second time
        # if the instance was already closed explicitly. Make that a no-op
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Pending tasks must be cancelled while the loop is still open;
            # nothing can be run on a closed loop
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """
        Run the coroutine `main` on `loop` until it completes and return its result

        @raises ValueError  If `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            # shutdown_default_executor does not exist in older Python versions
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all unfinished tasks of `loop`, wait for them to end and report their unhandled exceptions"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # The "loop" parameter of gather was removed in python 3.10.
        # It is not needed since the loop is taken from the given tasks
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5590
5591
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones.
    The header names of the result are title-cased"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5595
5596
def cached_method(f):
    """
    Decorator that caches the results of a method

    The results are stored on the instance itself, separately for each method name,
    and are keyed by the fully bound arguments (defaults applied).
    Hence, all the arguments, including the instance, must be hashable
    """
    sig = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = sig.bind(self, *args, **kwargs)
        call.apply_defaults()
        cache_key = tuple(call.arguments.values())

        try:
            all_caches = self.__cached_method__cache
        except AttributeError:
            all_caches = self.__cached_method__cache = {}

        results = all_caches.setdefault(f.__name__, {})
        if cache_key not in results:
            results[cache_key] = f(self, *args, **kwargs)
        return results[cache_key]
    return wrapper
5614
5615
class classproperty:
    """Decorator that turns a method into a read-only property computed from the class"""

    def __init__(self, func):
        # Copy the metadata (name, docstring etc) of the wrapped function
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, owner):
        # The instance is ignored; the function always receives the class
        getter = self.func
        return getter(owner)
5625
5626
class Namespace(types.SimpleNamespace):
    """Simple namespace that additionally allows iterating over the values of its attributes"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """View of the (attribute name, value) pairs"""
        return vars(self).items()
5636
5637
# File extensions grouped by the kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# The complete video/audio groups also include the respective common extensions (appended at the end)
MEDIA_EXTENSIONS.video = MEDIA_EXTENSIONS.video + MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio = MEDIA_EXTENSIONS.audio + MEDIA_EXTENSIONS.common_audio

KNOWN_EXTENSIONS = MEDIA_EXTENSIONS.video + MEDIA_EXTENSIONS.audio + MEDIA_EXTENSIONS.manifests
5652
5653
class RetryManager:
    """Iterable that yields itself once per attempt, for as long as the previous
    attempt recorded an error and the number of retries is not exhausted.

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        """
        @param _retries         Maximum number of retries (a falsy value means 0)
        @param _error_callback  Called as (error, attempt, retries, **kwargs)
                                after every attempt that recorded an error
        """
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks an attempt that finished without recording any error
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        """The error recorded for the current attempt, or None"""
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset before handing over control so that a clean attempt ends the loop
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e           The error that occurred
        @param count       Number of the attempt that failed
        @param retries     Maximum number of retries
        @param sleep_func  Delay in seconds before the next attempt, or a function
                           that returns it when called with n=<number of retries so far>
        @param info        Function used to report the delay
        @param warn        Function used to report the retry
        @param error       Function used to report that the retries are exhausted;
                           if not given, `e` is raised instead
        @param suffix      Optional text appended after "Retrying"
        """
        if count > retries:
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(e.cause or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = sleep_func
        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5708
5709
# Deprecated
# Truthiness of the `certifi` and `websockets` objects imported from `.dependencies`
# NOTE(review): presumably kept only for backward compatibility with external code - confirm before removing
has_certifi = bool(certifi)
has_websockets = bool(websockets)