import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
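
# Illustrative call for random_user_agent (a hedged sketch; the Chrome version
# in the result is picked at random from _CHROME_VERSIONS above):
#
#   >>> random_user_agent()  # doctest: +SKIP
#   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'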


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'

@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
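
# A hedged usage sketch for write_json_file (the path is illustrative):
#
#   >>> write_json_file({'id': 'abc'}, 'info.json')  # doctest: +SKIP
#
# The data is first written to a temporary file in the same directory and then
# os.rename()d over the target, so readers never observe a half-written file.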


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
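
# Illustrative example for xpath_with_ns (the namespace mapping is made up):
#
#   >>> xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}url'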


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kwargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
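
# A hedged example of the higher-level helpers built on this generator
# (the HTML snippet is made up for illustration):
#
#   >>> get_element_by_class('title', '<span class="title">Example</span>')
#   'Example'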


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
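
# Illustrative call (made-up element; note how attribute names are lowercased
# by html.parser and values are decoded):
#
#   >>> extract_attributes('<a href="https://example.com" DATA-id=42>')
#   {'href': 'https://example.com', 'data-id': '42'}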


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of their attribute dictionaries"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
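
# Hedged example (the snippet is made up):
#
#   >>> clean_html('<p>Hello<br/>world</p>')
#   'Hello\nworld'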


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
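
# A hedged sketch of how the decoder tolerates trailing garbage after a JSON
# object (the input is illustrative):
#
#   >>> json.loads('{"a": 1}garbage', cls=LenientJSONDecoder, ignore_extra=True)
#   {'a': 1}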


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
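
# Hedged examples of the substitution rules above (inputs are made up):
#
#   >>> sanitize_filename('A/B: C')
#   'A_B - C'
#   >>> sanitize_filename('A/B: C', restricted=True)
#   'A_B_-_C'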


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
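
# Illustrative fixups (example URLs are assumptions):
#
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'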


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
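
# A hedged example (the credentials are made up):
#
#   >>> extract_basic_auth('http://user:pass@example.com/')
#   ('http://example.com/', 'Basic dXNlcjpwYXNz')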


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
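
# Order-preserving dedup, illustrated:
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]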


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
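
# Entity decoding in practice (the example string is made up):
#
#   >>> unescapeHTML('Tom &amp; Jerry &#39;s')
#   "Tom & Jerry 's"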


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
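
# Worked example of the time arithmetic above:
#
#   >>> timetuple_from_msec(3725000)
#   Time(hours=1, minutes=2, seconds=5, milliseconds=0)
#   >>> formatSeconds(3725)
#   '1:02:05'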


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
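
# Hedged sketch of how extractors typically raise this (the message and ID are
# illustrative, not from this module):
#
#   raise ExtractorError('This video is private', expected=True, video_id='abc123')
#
# `expected=True` suppresses the bug-report footer, since the failure is a
# normal condition of the site rather than a bug in yt-dlp.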


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
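
# Illustrative transformation (the header values are made up):
#
#   >>> handle_youtubedl_headers({'User-Agent': 'UA', 'Accept-Encoding': 'gzip',
#   ...                           'Youtubedl-no-compression': 'True'})
#   {'User-Agent': 'UA'}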


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when a user does not check the 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Strip content headers, since the redirected request may use a different method
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
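
# Hedged example (the timestamp is made up):
#
#   >>> extract_timezone('2022-07-09T10:00:00+02:00')
#   (datetime.timedelta(seconds=7200), '2022-07-09T10:00:00')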
1692
1693
1694 def parse_iso8601(date_str, delimiter='T', timezone=None):
1695 """ Return a UNIX timestamp from the given date """
1696
1697 if date_str is None:
1698 return None
1699
1700 date_str = re.sub(r'\.[0-9]+', '', date_str)
1701
1702 if timezone is None:
1703 timezone, date_str = extract_timezone(date_str)
1704
1705 with contextlib.suppress(ValueError):
1706 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1707 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1708 return calendar.timegm(dt.timetuple())
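# Illustrative example: parse_iso8601('2014-02-10T12:00:00Z') -> 1392033600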
1709
1710
1711 def date_formats(day_first=True):
1712 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1713
1714
1715 def unified_strdate(date_str, day_first=True):
1716 """Return a string with the date in the format YYYYMMDD"""
1717
1718 if date_str is None:
1719 return None
1720 upload_date = None
1721 # Replace commas
1722 date_str = date_str.replace(',', ' ')
1723 # Remove AM/PM + timezone
1724 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1725 _, date_str = extract_timezone(date_str)
1726
1727 for expression in date_formats(day_first):
1728 with contextlib.suppress(ValueError):
1729 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1730 if upload_date is None:
1731 timetuple = email.utils.parsedate_tz(date_str)
1732 if timetuple:
1733 with contextlib.suppress(ValueError):
1734 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1735 if upload_date is not None:
1736 return str(upload_date)
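# Illustrative examples (exact coverage depends on DATE_FORMATS defined above):
#   unified_strdate('December 21, 2010') -> '20101221'
#   unified_strdate('1968/12/10')        -> '19681210'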
1737
1738
1739 def unified_timestamp(date_str, day_first=True):
1740 if date_str is None:
1741 return None
1742
1743 date_str = re.sub(r'[,|]', '', date_str)
1744
1745 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1746 timezone, date_str = extract_timezone(date_str)
1747
1748 # Remove AM/PM + timezone
1749 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1750
1751 # Remove unrecognized timezones from ISO 8601-like timestamps
1752 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1753 if m:
1754 date_str = date_str[:-len(m.group('tz'))]
1755
1756 # Python only supports microseconds, so remove nanoseconds
1757 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1758 if m:
1759 date_str = m.group(1)
1760
1761 for expression in date_formats(day_first):
1762 with contextlib.suppress(ValueError):
1763 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1764 return calendar.timegm(dt.timetuple())
1765 timetuple = email.utils.parsedate_tz(date_str)
1766 if timetuple:
1767 return calendar.timegm(timetuple) + pm_delta * 3600
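# Illustrative example: unified_timestamp('December 21, 2010') -> 1292889600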
1768
1769
1770 def determine_ext(url, default_ext='unknown_video'):
1771 if url is None or '.' not in url:
1772 return default_ext
1773 guess = url.partition('?')[0].rpartition('.')[2]
1774 if re.match(r'^[A-Za-z0-9]+$', guess):
1775 return guess
1776 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1777 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1778 return guess.rstrip('/')
1779 else:
1780 return default_ext
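# Illustrative examples:
#   determine_ext('http://example.com/foo/bar.mp4/?download') -> 'mp4'
#   determine_ext('http://example.com/page')                  -> 'unknown_video'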
1781
1782
1783 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1784 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1785
1786
1787 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1788 R"""
1789 Return a datetime object from a string.
1790 Supported format:
1791 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1792
1793 @param format strftime format of DATE
1794 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1795 auto: round to the unit provided in date_str (if applicable).
1796 """
1797 auto_precision = False
1798 if precision == 'auto':
1799 auto_precision = True
1800 precision = 'microsecond'
1801 today = datetime_round(datetime.datetime.utcnow(), precision)
1802 if date_str in ('now', 'today'):
1803 return today
1804 if date_str == 'yesterday':
1805 return today - datetime.timedelta(days=1)
1806 match = re.match(
1807 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1808 date_str)
1809 if match is not None:
1810 start_time = datetime_from_str(match.group('start'), precision, format)
1811 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1812 unit = match.group('unit')
1813 if unit in ('month', 'year'):
1814 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1815 unit = 'day'
1816 else:
1817 if unit == 'week':
1818 unit = 'day'
1819 time *= 7
1820 delta = datetime.timedelta(**{unit + 's': time})
1821 new_date = start_time + delta
1822 if auto_precision:
1823 return datetime_round(new_date, unit)
1824 return new_date
1825
1826 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
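# Illustrative examples:
#   datetime_from_str('now-1day')          # ~24h ago, rounded to the nearest day
#   datetime_from_str('20200101+3months')  # -> datetime(2020, 4, 1, 0, 0)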
1827
1828
1829 def date_from_str(date_str, format='%Y%m%d', strict=False):
1830 R"""
1831 Return a date object from a string using datetime_from_str
1832
1833 @param strict Restrict allowed patterns to "YYYYMMDD" and
1834 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1835 """
1836 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1837 raise ValueError(f'Invalid date format "{date_str}"')
1838 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1839
1840
1841 def datetime_add_months(dt, months):
1842 """Increment/Decrement a datetime object by months."""
1843 month = dt.month + months - 1
1844 year = dt.year + month // 12
1845 month = month % 12 + 1
1846 day = min(dt.day, calendar.monthrange(year, month)[1])
1847 return dt.replace(year, month, day)
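# The day is clamped to the length of the target month, e.g. (illustrative):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1) -> datetime(2020, 2, 29, 0, 0)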
1848
1849
1850 def datetime_round(dt, precision='day'):
1851 """
1852 Round a datetime object's time to a specific precision
1853 """
1854 if precision == 'microsecond':
1855 return dt
1856
1857 unit_seconds = {
1858 'day': 86400,
1859 'hour': 3600,
1860 'minute': 60,
1861 'second': 1,
1862 }
1863 roundto = lambda x, n: ((x + n / 2) // n) * n
1864 timestamp = calendar.timegm(dt.timetuple())
1865 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1866
1867
1868 def hyphenate_date(date_str):
1869 """
1870 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1871 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1872 if match is not None:
1873 return '-'.join(match.groups())
1874 else:
1875 return date_str
1876
1877
1878 class DateRange:
1879 """Represents a time interval between two dates"""
1880
1881 def __init__(self, start=None, end=None):
1882 """start and end must be strings in the format accepted by date"""
1883 if start is not None:
1884 self.start = date_from_str(start, strict=True)
1885 else:
1886 self.start = datetime.datetime.min.date()
1887 if end is not None:
1888 self.end = date_from_str(end, strict=True)
1889 else:
1890 self.end = datetime.datetime.max.date()
1891 if self.start > self.end:
1892 raise ValueError('Date range "%s": the start date must be before the end date' % self)
1893
1894 @classmethod
1895 def day(cls, day):
1896 """Returns a range that only contains the given day"""
1897 return cls(day, day)
1898
1899 def __contains__(self, date):
1900 """Check if the date is in the range"""
1901 if not isinstance(date, datetime.date):
1902 date = date_from_str(date)
1903 return self.start <= date <= self.end
1904
1905 def __str__(self):
1906 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1907
1908 def __eq__(self, other):
1909 return (isinstance(other, DateRange)
1910 and self.start == other.start and self.end == other.end)
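# Illustrative usage:
#   '20220615' in DateRange('20220101', '20221231') -> True
#   DateRange.day('20220615') == DateRange('20220615', '20220615') -> True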
1911
1912
1913 def platform_name():
1914 """ Returns the platform name as a str """
1915 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1916 return platform.platform()
1917
1918
1919 @functools.cache
1920 def system_identifier():
1921 python_implementation = platform.python_implementation()
1922 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1923 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1924
1925 return 'Python %s (%s %s) - %s %s' % (
1926 platform.python_version(),
1927 python_implementation,
1928 platform.architecture()[0],
1929 platform.platform(),
1930 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1931 )
1932
1933
1934 @functools.cache
1935 def get_windows_version():
1936 ''' Get the Windows version. Returns () if not running on Windows '''
1937 if compat_os_name == 'nt':
1938 return version_tuple(platform.win32_ver()[1])
1939 else:
1940 return ()
1941
1942
1943 def write_string(s, out=None, encoding=None):
1944 assert isinstance(s, str)
1945 out = out or sys.stderr
1946
1947 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1948 s = re.sub(r'([\r\n]+)', r' \1', s)
1949
1950 enc, buffer = None, out
1951 if 'b' in getattr(out, 'mode', ''):
1952 enc = encoding or preferredencoding()
1953 elif hasattr(out, 'buffer'):
1954 buffer = out.buffer
1955 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1956
1957 buffer.write(s.encode(enc, 'ignore') if enc else s)
1958 out.flush()
1959
1960
1961 def bytes_to_intlist(bs):
1962 if not bs:
1963 return []
1964 if isinstance(bs[0], int): # on Python 3, indexing bytes yields ints
1965 return list(bs)
1966 else:
1967 return [ord(c) for c in bs]
1968
1969
1970 def intlist_to_bytes(xs):
1971 if not xs:
1972 return b''
1973 return struct.pack('%dB' % len(xs), *xs)
1974
1975
1976 class LockingUnsupportedError(OSError):
1977 msg = 'File locking is not supported'
1978
1979 def __init__(self):
1980 super().__init__(self.msg)
1981
1982
1983 # Cross-platform file locking
1984 if sys.platform == 'win32':
1985 import ctypes.wintypes
1986 import msvcrt
1987
1988 class OVERLAPPED(ctypes.Structure):
1989 _fields_ = [
1990 ('Internal', ctypes.wintypes.LPVOID),
1991 ('InternalHigh', ctypes.wintypes.LPVOID),
1992 ('Offset', ctypes.wintypes.DWORD),
1993 ('OffsetHigh', ctypes.wintypes.DWORD),
1994 ('hEvent', ctypes.wintypes.HANDLE),
1995 ]
1996
1997 kernel32 = ctypes.windll.kernel32
1998 LockFileEx = kernel32.LockFileEx
1999 LockFileEx.argtypes = [
2000 ctypes.wintypes.HANDLE, # hFile
2001 ctypes.wintypes.DWORD, # dwFlags
2002 ctypes.wintypes.DWORD, # dwReserved
2003 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2004 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2005 ctypes.POINTER(OVERLAPPED) # Overlapped
2006 ]
2007 LockFileEx.restype = ctypes.wintypes.BOOL
2008 UnlockFileEx = kernel32.UnlockFileEx
2009 UnlockFileEx.argtypes = [
2010 ctypes.wintypes.HANDLE, # hFile
2011 ctypes.wintypes.DWORD, # dwReserved
2012 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2013 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2014 ctypes.POINTER(OVERLAPPED) # Overlapped
2015 ]
2016 UnlockFileEx.restype = ctypes.wintypes.BOOL
2017 whole_low = 0xffffffff
2018 whole_high = 0x7fffffff
2019
2020 def _lock_file(f, exclusive, block):
2021 overlapped = OVERLAPPED()
2022 overlapped.Offset = 0
2023 overlapped.OffsetHigh = 0
2024 overlapped.hEvent = 0
2025 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2026
2027 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2028 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2029 0, whole_low, whole_high, f._lock_file_overlapped_p):
2030 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2031 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2032
2033 def _unlock_file(f):
2034 assert f._lock_file_overlapped_p
2035 handle = msvcrt.get_osfhandle(f.fileno())
2036 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2037 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2038
2039 else:
2040 try:
2041 import fcntl
2042
2043 def _lock_file(f, exclusive, block):
2044 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2045 if not block:
2046 flags |= fcntl.LOCK_NB
2047 try:
2048 fcntl.flock(f, flags)
2049 except BlockingIOError:
2050 raise
2051 except OSError: # AOSP does not have flock()
2052 fcntl.lockf(f, flags)
2053
2054 def _unlock_file(f):
2055 try:
2056 fcntl.flock(f, fcntl.LOCK_UN)
2057 except OSError:
2058 fcntl.lockf(f, fcntl.LOCK_UN)
2059
2060 except ImportError:
2061
2062 def _lock_file(f, exclusive, block):
2063 raise LockingUnsupportedError()
2064
2065 def _unlock_file(f):
2066 raise LockingUnsupportedError()
2067
2068
2069 class locked_file:
2070 locked = False
2071
2072 def __init__(self, filename, mode, block=True, encoding=None):
2073 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2074 raise NotImplementedError(mode)
2075 self.mode, self.block = mode, block
2076
2077 writable = any(f in mode for f in 'wax+')
2078 readable = any(f in mode for f in 'r+')
2079 flags = functools.reduce(operator.ior, (
2080 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2081 getattr(os, 'O_BINARY', 0), # Windows only
2082 getattr(os, 'O_NOINHERIT', 0), # Windows only
2083 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2084 os.O_APPEND if 'a' in mode else 0,
2085 os.O_EXCL if 'x' in mode else 0,
2086 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2087 ))
2088
2089 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2090
2091 def __enter__(self):
2092 exclusive = 'r' not in self.mode
2093 try:
2094 _lock_file(self.f, exclusive, self.block)
2095 self.locked = True
2096 except OSError:
2097 self.f.close()
2098 raise
2099 if 'w' in self.mode:
2100 try:
2101 self.f.truncate()
2102 except OSError as e:
2103 if e.errno not in (
2104 errno.ESPIPE, # Illegal seek - expected for FIFO
2105 errno.EINVAL, # Invalid argument - expected for /dev/null
2106 ):
2107 raise
2108 return self
2109
2110 def unlock(self):
2111 if not self.locked:
2112 return
2113 try:
2114 _unlock_file(self.f)
2115 finally:
2116 self.locked = False
2117
2118 def __exit__(self, *_):
2119 try:
2120 self.unlock()
2121 finally:
2122 self.f.close()
2123
2124 open = __enter__
2125 close = __exit__
2126
2127 def __getattr__(self, attr):
2128 return getattr(self.f, attr)
2129
2130 def __iter__(self):
2131 return iter(self.f)
2132
2133
2134 @functools.cache
2135 def get_filesystem_encoding():
2136 encoding = sys.getfilesystemencoding()
2137 return encoding if encoding is not None else 'utf-8'
2138
2139
2140 def shell_quote(args):
2141 quoted_args = []
2142 encoding = get_filesystem_encoding()
2143 for a in args:
2144 if isinstance(a, bytes):
2145 # We may get a filename encoded with 'encodeFilename'
2146 a = a.decode(encoding)
2147 quoted_args.append(compat_shlex_quote(a))
2148 return ' '.join(quoted_args)
2149
2150
2151 def smuggle_url(url, data):
2152 """ Pass additional data in a URL for internal use. """
2153
2154 url, idata = unsmuggle_url(url, {})
2155 data.update(idata)
2156 sdata = urllib.parse.urlencode(
2157 {'__youtubedl_smuggle': json.dumps(data)})
2158 return url + '#' + sdata
2159
2160
2161 def unsmuggle_url(smug_url, default=None):
2162 if '#__youtubedl_smuggle' not in smug_url:
2163 return smug_url, default
2164 url, _, sdata = smug_url.rpartition('#')
2165 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2166 data = json.loads(jsond)
2167 return url, data
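# Illustrative round trip (the payload travels in the URL fragment; keys here are arbitrary examples):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url(url) -> ('https://example.com/video', {'referer': 'https://example.com'})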
2168
2169
2170 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2171 """ Formats numbers with decimal sufixes like K, M, etc """
2172 num, factor = float_or_none(num), float(factor)
2173 if num is None or num < 0:
2174 return None
2175 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2176 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2177 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2178 if factor == 1024:
2179 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2180 converted = num / (factor ** exponent)
2181 return fmt % (converted, suffix)
2182
2183
2184 def format_bytes(bytes):
2185 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
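# Illustrative examples:
#   format_decimal_suffix(1234000) -> '1M'
#   format_bytes(1536)             -> '1.50KiB'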
2186
2187
2188 def lookup_unit_table(unit_table, s):
2189 units_re = '|'.join(re.escape(u) for u in unit_table)
2190 m = re.match(
2191 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2192 if not m:
2193 return None
2194 num_str = m.group('num').replace(',', '.')
2195 mult = unit_table[m.group('unit')]
2196 return int(float(num_str) * mult)
2197
2198
2199 def parse_filesize(s):
2200 if s is None:
2201 return None
2202
2203 # The lower-case forms are of course incorrect and unofficial,
2204 # but we support those too
2205 _UNIT_TABLE = {
2206 'B': 1,
2207 'b': 1,
2208 'bytes': 1,
2209 'KiB': 1024,
2210 'KB': 1000,
2211 'kB': 1024,
2212 'Kb': 1000,
2213 'kb': 1000,
2214 'kilobytes': 1000,
2215 'kibibytes': 1024,
2216 'MiB': 1024 ** 2,
2217 'MB': 1000 ** 2,
2218 'mB': 1024 ** 2,
2219 'Mb': 1000 ** 2,
2220 'mb': 1000 ** 2,
2221 'megabytes': 1000 ** 2,
2222 'mebibytes': 1024 ** 2,
2223 'GiB': 1024 ** 3,
2224 'GB': 1000 ** 3,
2225 'gB': 1024 ** 3,
2226 'Gb': 1000 ** 3,
2227 'gb': 1000 ** 3,
2228 'gigabytes': 1000 ** 3,
2229 'gibibytes': 1024 ** 3,
2230 'TiB': 1024 ** 4,
2231 'TB': 1000 ** 4,
2232 'tB': 1024 ** 4,
2233 'Tb': 1000 ** 4,
2234 'tb': 1000 ** 4,
2235 'terabytes': 1000 ** 4,
2236 'tebibytes': 1024 ** 4,
2237 'PiB': 1024 ** 5,
2238 'PB': 1000 ** 5,
2239 'pB': 1024 ** 5,
2240 'Pb': 1000 ** 5,
2241 'pb': 1000 ** 5,
2242 'petabytes': 1000 ** 5,
2243 'pebibytes': 1024 ** 5,
2244 'EiB': 1024 ** 6,
2245 'EB': 1000 ** 6,
2246 'eB': 1024 ** 6,
2247 'Eb': 1000 ** 6,
2248 'eb': 1000 ** 6,
2249 'exabytes': 1000 ** 6,
2250 'exbibytes': 1024 ** 6,
2251 'ZiB': 1024 ** 7,
2252 'ZB': 1000 ** 7,
2253 'zB': 1024 ** 7,
2254 'Zb': 1000 ** 7,
2255 'zb': 1000 ** 7,
2256 'zettabytes': 1000 ** 7,
2257 'zebibytes': 1024 ** 7,
2258 'YiB': 1024 ** 8,
2259 'YB': 1000 ** 8,
2260 'yB': 1024 ** 8,
2261 'Yb': 1000 ** 8,
2262 'yb': 1000 ** 8,
2263 'yottabytes': 1000 ** 8,
2264 'yobibytes': 1024 ** 8,
2265 }
2266
2267 return lookup_unit_table(_UNIT_TABLE, s)
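# Illustrative examples: parse_filesize('5 MiB') -> 5242880; parse_filesize('1,5kB') -> 1536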
2268
2269
2270 def parse_count(s):
2271 if s is None:
2272 return None
2273
2274 s = re.sub(r'^[^\d]+\s', '', s).strip()
2275
2276 if re.match(r'^[\d,.]+$', s):
2277 return str_to_int(s)
2278
2279 _UNIT_TABLE = {
2280 'k': 1000,
2281 'K': 1000,
2282 'm': 1000 ** 2,
2283 'M': 1000 ** 2,
2284 'kk': 1000 ** 2,
2285 'KK': 1000 ** 2,
2286 'b': 1000 ** 3,
2287 'B': 1000 ** 3,
2288 }
2289
2290 ret = lookup_unit_table(_UNIT_TABLE, s)
2291 if ret is not None:
2292 return ret
2293
2294 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2295 if mobj:
2296 return str_to_int(mobj.group(1))
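# Illustrative examples: parse_count('1.5M') -> 1500000; parse_count('1,234 views') -> 1234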
2297
2298
2299 def parse_resolution(s, *, lenient=False):
2300 if s is None:
2301 return {}
2302
2303 if lenient:
2304 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2305 else:
2306 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2307 if mobj:
2308 return {
2309 'width': int(mobj.group('w')),
2310 'height': int(mobj.group('h')),
2311 }
2312
2313 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2314 if mobj:
2315 return {'height': int(mobj.group(1))}
2316
2317 mobj = re.search(r'\b([48])[kK]\b', s)
2318 if mobj:
2319 return {'height': int(mobj.group(1)) * 540}
2320
2321 return {}
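# Illustrative examples:
#   parse_resolution('1920x1080') -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')      -> {'height': 720}
#   parse_resolution('4k')        -> {'height': 2160}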
2322
2323
2324 def parse_bitrate(s):
2325 if not isinstance(s, str):
2326 return
2327 mobj = re.search(r'\b(\d+)\s*kbps', s)
2328 if mobj:
2329 return int(mobj.group(1))
2330
2331
2332 def month_by_name(name, lang='en'):
2333 """ Return the number of a month by (locale-independently) English name """
2334
2335 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2336
2337 try:
2338 return month_names.index(name) + 1
2339 except ValueError:
2340 return None
2341
2342
2343 def month_by_abbreviation(abbrev):
2344 """ Return the number of a month by (locale-independently) English
2345 abbreviations """
2346
2347 try:
2348 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2349 except ValueError:
2350 return None
2351
2352
2353 def fix_xml_ampersands(xml_str):
2354 """Replace all the '&' by '&amp;' in XML"""
2355 return re.sub(
2356 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2357 '&amp;',
2358 xml_str)
2359
2360
2361 def setproctitle(title):
2362 assert isinstance(title, str)
2363
2364 # ctypes in Jython is not complete
2365 # http://bugs.jython.org/issue2148
2366 if sys.platform.startswith('java'):
2367 return
2368
2369 try:
2370 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2371 except OSError:
2372 return
2373 except TypeError:
2374 # LoadLibrary in Windows Python 2.7.13 only expects
2375 # a bytestring, but since unicode_literals turns
2376 # every string into a unicode string, it fails.
2377 return
2378 title_bytes = title.encode()
2379 buf = ctypes.create_string_buffer(len(title_bytes))
2380 buf.value = title_bytes
2381 try:
2382 libc.prctl(15, buf, 0, 0, 0)
2383 except AttributeError:
2384 return # Strange libc, just skip this
2385
2386
2387 def remove_start(s, start):
2388 return s[len(start):] if s is not None and s.startswith(start) else s
2389
2390
2391 def remove_end(s, end):
2392 return s[:-len(end)] if s is not None and s.endswith(end) else s
2393
2394
2395 def remove_quotes(s):
2396 if s is None or len(s) < 2:
2397 return s
2398 for quote in ('"', "'", ):
2399 if s[0] == quote and s[-1] == quote:
2400 return s[1:-1]
2401 return s
2402
2403
2404 def get_domain(url):
2405 """
2406 This implementation is inconsistent, but is kept for compatibility.
2407 Use this only for "webpage_url_domain"
2408 """
2409 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2410
2411
2412 def url_basename(url):
2413 path = urllib.parse.urlparse(url).path
2414 return path.strip('/').split('/')[-1]
2415
2416
2417 def base_url(url):
2418 return re.match(r'https?://[^?#&]+/', url).group()
2419
2420
2421 def urljoin(base, path):
2422 if isinstance(path, bytes):
2423 path = path.decode()
2424 if not isinstance(path, str) or not path:
2425 return None
2426 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2427 return path
2428 if isinstance(base, bytes):
2429 base = base.decode()
2430 if not isinstance(base, str) or not re.match(
2431 r'^(?:https?:)?//', base):
2432 return None
2433 return urllib.parse.urljoin(base, path)
2434
2435
2436 class HEADRequest(urllib.request.Request):
2437 def get_method(self):
2438 return 'HEAD'
2439
2440
2441 class PUTRequest(urllib.request.Request):
2442 def get_method(self):
2443 return 'PUT'
2444
2445
2446 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2447 if get_attr and v is not None:
2448 v = getattr(v, get_attr, None)
2449 try:
2450 return int(v) * invscale // scale
2451 except (ValueError, TypeError, OverflowError):
2452 return default
2453
2454
2455 def str_or_none(v, default=None):
2456 return default if v is None else str(v)
2457
2458
2459 def str_to_int(int_str):
2460 """ A more relaxed version of int_or_none """
2461 if isinstance(int_str, int):
2462 return int_str
2463 elif isinstance(int_str, str):
2464 int_str = re.sub(r'[,\.\+]', '', int_str)
2465 return int_or_none(int_str)
2466
2467
2468 def float_or_none(v, scale=1, invscale=1, default=None):
2469 if v is None:
2470 return default
2471 try:
2472 return float(v) * invscale / scale
2473 except (ValueError, TypeError):
2474 return default
2475
2476
2477 def bool_or_none(v, default=None):
2478 return v if isinstance(v, bool) else default
2479
2480
2481 def strip_or_none(v, default=None):
2482 return v.strip() if isinstance(v, str) else default
2483
2484
2485 def url_or_none(url):
2486 if not url or not isinstance(url, str):
2487 return None
2488 url = url.strip()
2489 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2490
2491
2492 def request_to_url(req):
2493 if isinstance(req, urllib.request.Request):
2494 return req.get_full_url()
2495 else:
2496 return req
2497
2498
2499 def strftime_or_none(timestamp, date_format, default=None):
2500 datetime_object = None
2501 try:
2502 if isinstance(timestamp, (int, float)): # unix timestamp
2503 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2504 elif isinstance(timestamp, str): # assume YYYYMMDD
2505 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2506 return datetime_object.strftime(date_format)
2507 except (ValueError, TypeError, AttributeError):
2508 return default
2509
2510
2511 def parse_duration(s):
2512 if not isinstance(s, str):
2513 return None
2514 s = s.strip()
2515 if not s:
2516 return None
2517
2518 days, hours, mins, secs, ms = [None] * 5
2519 m = re.match(r'''(?x)
2520 (?P<before_secs>
2521 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2522 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2523 (?P<ms>[.:][0-9]+)?Z?$
2524 ''', s)
2525 if m:
2526 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2527 else:
2528 m = re.match(
2529 r'''(?ix)(?:P?
2530 (?:
2531 [0-9]+\s*y(?:ears?)?,?\s*
2532 )?
2533 (?:
2534 [0-9]+\s*m(?:onths?)?,?\s*
2535 )?
2536 (?:
2537 [0-9]+\s*w(?:eeks?)?,?\s*
2538 )?
2539 (?:
2540 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2541 )?
2542 T)?
2543 (?:
2544 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2545 )?
2546 (?:
2547 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2548 )?
2549 (?:
2550 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2551 )?Z?$''', s)
2552 if m:
2553 days, hours, mins, secs, ms = m.groups()
2554 else:
2555 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2556 if m:
2557 hours, mins = m.groups()
2558 else:
2559 return None
2560
2561 if ms:
2562 ms = ms.replace(':', '.')
2563 return sum(float(part or 0) * mult for part, mult in (
2564 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
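# Illustrative examples:
#   parse_duration('1:23:45') -> 5025.0
#   parse_duration('PT1H30M') -> 5400.0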
2565
2566
2567 def prepend_extension(filename, ext, expected_real_ext=None):
2568 name, real_ext = os.path.splitext(filename)
2569 return (
2570 f'{name}.{ext}{real_ext}'
2571 if not expected_real_ext or real_ext[1:] == expected_real_ext
2572 else f'{filename}.{ext}')
2573
2574
2575 def replace_extension(filename, ext, expected_real_ext=None):
2576 name, real_ext = os.path.splitext(filename)
2577 return '{}.{}'.format(
2578 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2579 ext)
2580
2581
2582 def check_executable(exe, args=[]):
2583 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2584 args can be a list of arguments for a short output (like -version) """
2585 try:
2586 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2587 except OSError:
2588 return False
2589 return exe
2590
2591
2592 def _get_exe_version_output(exe, args, *, to_screen=None):
2593 if to_screen:
2594 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2595 try:
2596 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2597 # SIGTTOU if yt-dlp is run in the background.
2598 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2599 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2600 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2601 except OSError:
2602 return False
2603 return stdout
2604
2605
2606 def detect_exe_version(output, version_re=None, unrecognized='present'):
2607 assert isinstance(output, str)
2608 if version_re is None:
2609 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2610 m = re.search(version_re, output)
2611 if m:
2612 return m.group(1)
2613 else:
2614 return unrecognized
2615
2616
2617 def get_exe_version(exe, args=['--version'],
2618 version_re=None, unrecognized='present'):
2619 """ Returns the version of the specified executable,
2620 or False if the executable is not present """
2621 out = _get_exe_version_output(exe, args)
2622 return detect_exe_version(out, version_re, unrecognized) if out else False
2623
2624
2625 def frange(start=0, stop=None, step=1):
2626 """Float range"""
2627 if stop is None:
2628 start, stop = 0, start
2629 sign = [-1, 1][step > 0] if step else 0
2630 while sign * start < sign * stop:
2631 yield start
2632 start += step
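# Illustrative examples:
#   list(frange(5))          -> [0, 1, 2, 3, 4]
#   list(frange(0, 1, 0.25)) -> [0, 0.25, 0.5, 0.75]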
2633
2634
2635 class LazyList(collections.abc.Sequence):
2636 """Lazy immutable list from an iterable
2637 Note that slices of a LazyList are lists and not LazyList"""
2638
2639 class IndexError(IndexError):
2640 pass
2641
2642 def __init__(self, iterable, *, reverse=False, _cache=None):
2643 self._iterable = iter(iterable)
2644 self._cache = [] if _cache is None else _cache
2645 self._reversed = reverse
2646
2647 def __iter__(self):
2648 if self._reversed:
2649 # We need to consume the entire iterable to iterate in reverse
2650 yield from self.exhaust()
2651 return
2652 yield from self._cache
2653 for item in self._iterable:
2654 self._cache.append(item)
2655 yield item
2656
2657 def _exhaust(self):
2658 self._cache.extend(self._iterable)
2659 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2660 return self._cache
2661
2662 def exhaust(self):
2663 """Evaluate the entire iterable"""
2664 return self._exhaust()[::-1 if self._reversed else 1]
2665
2666 @staticmethod
2667 def _reverse_index(x):
2668 return None if x is None else ~x
2669
2670 def __getitem__(self, idx):
2671 if isinstance(idx, slice):
2672 if self._reversed:
2673 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2674 start, stop, step = idx.start, idx.stop, idx.step or 1
2675 elif isinstance(idx, int):
2676 if self._reversed:
2677 idx = self._reverse_index(idx)
2678 start, stop, step = idx, idx, 0
2679 else:
2680 raise TypeError('indices must be integers or slices')
2681 if ((start or 0) < 0 or (stop or 0) < 0
2682 or (start is None and step < 0)
2683 or (stop is None and step > 0)):
2684 # We need to consume the entire iterable to be able to slice from the end
2685 # Obviously, never use this with infinite iterables
2686 self._exhaust()
2687 try:
2688 return self._cache[idx]
2689 except IndexError as e:
2690 raise self.IndexError(e) from e
2691 n = max(start or 0, stop or 0) - len(self._cache) + 1
2692 if n > 0:
2693 self._cache.extend(itertools.islice(self._iterable, n))
2694 try:
2695 return self._cache[idx]
2696 except IndexError as e:
2697 raise self.IndexError(e) from e
2698
2699 def __bool__(self):
2700 try:
2701 self[-1] if self._reversed else self[0]
2702 except self.IndexError:
2703 return False
2704 return True
2705
2706 def __len__(self):
2707 self._exhaust()
2708 return len(self._cache)
2709
2710 def __reversed__(self):
2711 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2712
2713 def __copy__(self):
2714 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2715
2716 def __repr__(self):
2717 # repr and str should mimic a list. So we exhaust the iterable
2718 return repr(self.exhaust())
2719
2720 def __str__(self):
2721 return repr(self.exhaust())
2722
2723
2724 class PagedList:
2725
2726 class IndexError(IndexError):
2727 pass
2728
2729 def __len__(self):
2730 # This is only useful for tests
2731 return len(self.getslice())
2732
2733 def __init__(self, pagefunc, pagesize, use_cache=True):
2734 self._pagefunc = pagefunc
2735 self._pagesize = pagesize
2736 self._pagecount = float('inf')
2737 self._use_cache = use_cache
2738 self._cache = {}
2739
2740 def getpage(self, pagenum):
2741 page_results = self._cache.get(pagenum)
2742 if page_results is None:
2743 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2744 if self._use_cache:
2745 self._cache[pagenum] = page_results
2746 return page_results
2747
2748 def getslice(self, start=0, end=None):
2749 return list(self._getslice(start, end))
2750
2751 def _getslice(self, start, end):
2752 raise NotImplementedError('This method must be implemented by subclasses')
2753
2754 def __getitem__(self, idx):
2755 assert self._use_cache, 'Indexing PagedList requires cache'
2756 if not isinstance(idx, int) or idx < 0:
2757 raise TypeError('indices must be non-negative integers')
2758 entries = self.getslice(idx, idx + 1)
2759 if not entries:
2760 raise self.IndexError()
2761 return entries[0]
2762
2763
2764 class OnDemandPagedList(PagedList):
2765 """Download pages until a page with less than maximum results"""
2766
2767 def _getslice(self, start, end):
2768 for pagenum in itertools.count(start // self._pagesize):
2769 firstid = pagenum * self._pagesize
2770 nextfirstid = pagenum * self._pagesize + self._pagesize
2771 if start >= nextfirstid:
2772 continue
2773
2774 startv = (
2775 start % self._pagesize
2776 if firstid <= start < nextfirstid
2777 else 0)
2778 endv = (
2779 ((end - 1) % self._pagesize) + 1
2780 if (end is not None and firstid <= end <= nextfirstid)
2781 else None)
2782
2783 try:
2784 page_results = self.getpage(pagenum)
2785 except Exception:
2786 self._pagecount = pagenum - 1
2787 raise
2788 if startv != 0 or endv is not None:
2789 page_results = page_results[startv:endv]
2790 yield from page_results
2791
2792 # A little optimization: if the current page is not "full", i.e. does
2793 # not contain page_size videos, then we can assume that this page
2794 # is the last one - there are no more ids on further pages -
2795 # so there is no need to query again.
2796 if len(page_results) + startv < self._pagesize:
2797 break
2798
2799 # If we got the whole page, but the next page is not interesting,
2800 # break out early as well
2801 if end == nextfirstid:
2802 break
2803
2804
2805 class InAdvancePagedList(PagedList):
2806 """PagedList with total number of pages known in advance"""
2807
2808 def __init__(self, pagefunc, pagecount, pagesize):
2809 PagedList.__init__(self, pagefunc, pagesize, True)
2810 self._pagecount = pagecount
2811
2812 def _getslice(self, start, end):
2813 start_page = start // self._pagesize
2814 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2815 skip_elems = start - start_page * self._pagesize
2816 only_more = None if end is None else end - start
2817 for pagenum in range(start_page, end_page):
2818 page_results = self.getpage(pagenum)
2819 if skip_elems:
2820 page_results = page_results[skip_elems:]
2821 skip_elems = None
2822 if only_more is not None:
2823 if len(page_results) < only_more:
2824 only_more -= len(page_results)
2825 else:
2826 yield from page_results[:only_more]
2827 break
2828 yield from page_results
2829
2830
2831 class PlaylistEntries:
2832 MissingEntry = object()
2833 is_exhausted = False
2834
2835 def __init__(self, ydl, info_dict):
2836 self.ydl = ydl
2837
2838 # _entries must be assigned now since infodict can change during iteration
2839 entries = info_dict.get('entries')
2840 if entries is None:
2841 raise EntryNotInPlaylist('There are no entries')
2842 elif isinstance(entries, list):
2843 self.is_exhausted = True
2844
2845 requested_entries = info_dict.get('requested_entries')
2846 self.is_incomplete = bool(requested_entries)
2847 if self.is_incomplete:
2848 assert self.is_exhausted
2849 self._entries = [self.MissingEntry] * max(requested_entries)
2850 for i, entry in zip(requested_entries, entries):
2851 self._entries[i - 1] = entry
2852 elif isinstance(entries, (list, PagedList, LazyList)):
2853 self._entries = entries
2854 else:
2855 self._entries = LazyList(entries)
2856
2857 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2858 (?P<start>[+-]?\d+)?
2859 (?P<range>[:-]
2860 (?P<end>[+-]?\d+|inf(?:inite)?)?
2861 (?::(?P<step>[+-]?\d+))?
2862 )?''')
2863
2864 @classmethod
2865 def parse_playlist_items(cls, string):
2866 for segment in string.split(','):
2867 if not segment:
2868 raise ValueError('There are two or more consecutive commas')
2869 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2870 if not mobj:
2871 raise ValueError(f'{segment!r} is not a valid specification')
2872 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2873 if int_or_none(step) == 0:
2874 raise ValueError(f'Step in {segment!r} cannot be zero')
2875 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
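# Illustrative parses (a slice stop may be a float, since 'inf' is accepted as an end):
#   list(parse_playlist_items('1,3:5'))  -> [1, slice(3, 5.0, None)]
#   list(parse_playlist_items('-5::-1')) -> [slice(-5, None, -1)]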
2876
2877 def get_requested_items(self):
2878 playlist_items = self.ydl.params.get('playlist_items')
2879 playlist_start = self.ydl.params.get('playliststart', 1)
2880 playlist_end = self.ydl.params.get('playlistend')
2881 # For backwards compatibility, interpret -1 as whole list
2882 if playlist_end in (-1, None):
2883 playlist_end = ''
2884 if not playlist_items:
2885 playlist_items = f'{playlist_start}:{playlist_end}'
2886 elif playlist_start != 1 or playlist_end:
2887 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2888
2889 for index in self.parse_playlist_items(playlist_items):
2890 for i, entry in self[index]:
2891 yield i, entry
2892 if not entry:
2893 continue
2894 try:
2895 # TODO: Add auto-generated fields
2896 self.ydl._match_entry(entry, incomplete=True, silent=True)
2897 except (ExistingVideoReached, RejectedVideoReached):
2898 return
2899
2900 def get_full_count(self):
2901 if self.is_exhausted and not self.is_incomplete:
2902 return len(self)
2903 elif isinstance(self._entries, InAdvancePagedList):
2904 if self._entries._pagesize == 1:
2905 return self._entries._pagecount
2906
2907 @functools.cached_property
2908 def _getter(self):
2909 if isinstance(self._entries, list):
2910 def get_entry(i):
2911 try:
2912 entry = self._entries[i]
2913 except IndexError:
2914 entry = self.MissingEntry
2915 if not self.is_incomplete:
2916 raise self.IndexError()
2917 if entry is self.MissingEntry:
2918 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2919 return entry
2920 else:
2921 def get_entry(i):
2922 try:
2923 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2924 except (LazyList.IndexError, PagedList.IndexError):
2925 raise self.IndexError()
2926 return get_entry
2927
2928 def __getitem__(self, idx):
2929 if isinstance(idx, int):
2930 idx = slice(idx, idx)
2931
2932 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2933 step = 1 if idx.step is None else idx.step
2934 if idx.start is None:
2935 start = 0 if step > 0 else len(self) - 1
2936 else:
2937 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2938
2939 # NB: Do not call len(self) when idx == [:]
2940 if idx.stop is None:
2941 stop = 0 if step < 0 else float('inf')
2942 else:
2943 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2944 stop += [-1, 1][step > 0]
2945
2946 for i in frange(start, stop, step):
2947 if i < 0:
2948 continue
2949 try:
2950 entry = self._getter(i)
2951 except self.IndexError:
2952 self.is_exhausted = True
2953 if step > 0:
2954 break
2955 continue
2956 yield i + 1, entry
2957
2958 def __len__(self):
2959 return len(tuple(self[:]))
2960
2961 class IndexError(IndexError):
2962 pass
2963
2964
2965 def uppercase_escape(s):
2966 unicode_escape = codecs.getdecoder('unicode_escape')
2967 return re.sub(
2968 r'\\U[0-9a-fA-F]{8}',
2969 lambda m: unicode_escape(m.group(0))[0],
2970 s)
2971
2972
2973 def lowercase_escape(s):
2974 unicode_escape = codecs.getdecoder('unicode_escape')
2975 return re.sub(
2976 r'\\u[0-9a-fA-F]{4}',
2977 lambda m: unicode_escape(m.group(0))[0],
2978 s)
2979
2980
2981 def escape_rfc3986(s):
2982 """Escape non-ASCII characters as suggested by RFC 3986"""
2983 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2984
2985
2986 def escape_url(url):
2987 """Escape URL as suggested by RFC 3986"""
2988 url_parsed = urllib.parse.urlparse(url)
2989 return url_parsed._replace(
2990 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2991 path=escape_rfc3986(url_parsed.path),
2992 params=escape_rfc3986(url_parsed.params),
2993 query=escape_rfc3986(url_parsed.query),
2994 fragment=escape_rfc3986(url_parsed.fragment)
2995 ).geturl()
2996
2997
2998 def parse_qs(url):
2999 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3000
3001
3002 def read_batch_urls(batch_fd):
3003 def fixup(url):
3004 if not isinstance(url, str):
3005 url = url.decode('utf-8', 'replace')
3006 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3007 for bom in BOM_UTF8:
3008 if url.startswith(bom):
3009 url = url[len(bom):]
3010 url = url.lstrip()
3011 if not url or url.startswith(('#', ';', ']')):
3012 return False
3013 # "#" cannot be stripped out since it is part of the URI
3014 # However, it can be safely stripped out if following a whitespace
3015 return re.split(r'\s#', url, 1)[0].rstrip()
3016
3017 with contextlib.closing(batch_fd) as fd:
3018 return [url for url in map(fixup, fd) if url]
3019
3020
3021 def urlencode_postdata(*args, **kargs):
3022 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3023
3024
3025 def update_url_query(url, query):
3026 if not query:
3027 return url
3028 parsed_url = urllib.parse.urlparse(url)
3029 qs = urllib.parse.parse_qs(parsed_url.query)
3030 qs.update(query)
3031 return urllib.parse.urlunparse(parsed_url._replace(
3032 query=urllib.parse.urlencode(qs, True)))
3033
3034
3035 def update_Request(req, url=None, data=None, headers=None, query=None):
3036 req_headers = req.headers.copy()
3037 req_headers.update(headers or {})
3038 req_data = data or req.data
3039 req_url = update_url_query(url or req.get_full_url(), query)
3040 req_get_method = req.get_method()
3041 if req_get_method == 'HEAD':
3042 req_type = HEADRequest
3043 elif req_get_method == 'PUT':
3044 req_type = PUTRequest
3045 else:
3046 req_type = urllib.request.Request
3047 new_req = req_type(
3048 req_url, data=req_data, headers=req_headers,
3049 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3050 if hasattr(req, 'timeout'):
3051 new_req.timeout = req.timeout
3052 return new_req
3053
3054
3055 def _multipart_encode_impl(data, boundary):
3056 content_type = 'multipart/form-data; boundary=%s' % boundary
3057
3058 out = b''
3059 for k, v in data.items():
3060 out += b'--' + boundary.encode('ascii') + b'\r\n'
3061 if isinstance(k, str):
3062 k = k.encode()
3063 if isinstance(v, str):
3064 v = v.encode()
3065 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3066 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3067 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3068 if boundary.encode('ascii') in content:
3069 raise ValueError('Boundary overlaps with data')
3070 out += content
3071
3072 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3073
3074 return out, content_type
3075
3076
3077 def multipart_encode(data, boundary=None):
3078 '''
3079 Encode a dict to RFC 7578-compliant form-data
3080
3081 data:
3082 A dict where keys and values can be either Unicode or bytes-like
3083 objects.
3084 boundary:
3085 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3086 a random boundary is generated.
3087
3088 Reference: https://tools.ietf.org/html/rfc7578
3089 '''
3090 has_specified_boundary = boundary is not None
3091
3092 while True:
3093 if boundary is None:
3094 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3095
3096 try:
3097 out, content_type = _multipart_encode_impl(data, boundary)
3098 break
3099 except ValueError:
3100 if has_specified_boundary:
3101 raise
3102 boundary = None
3103
3104 return out, content_type
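# Illustrative example, with a fixed boundary for determinism:
#   multipart_encode({'field': 'value'}, boundary='X') == (
#       b'--X\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--X--\r\n',
#       'multipart/form-data; boundary=X')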
3105
3106
3107 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3108 for val in map(d.get, variadic(key_or_keys)):
3109 if val is not None and (val or not skip_false_values):
3110 return val
3111 return default
3112
3113
3114 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3115 for f in funcs:
3116 try:
3117 val = f(*args, **kwargs)
3118 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3119 pass
3120 else:
3121 if expected_type is None or isinstance(val, expected_type):
3122 return val
3123
3124
3125 def try_get(src, getter, expected_type=None):
3126 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3127
3128
3129 def filter_dict(dct, cndn=lambda _, v: v is not None):
3130 return {k: v for k, v in dct.items() if cndn(k, v)}
3131
3132
3133 def merge_dicts(*dicts):
3134 merged = {}
3135 for a_dict in dicts:
3136 for k, v in a_dict.items():
3137 if (v is not None and k not in merged
3138 or isinstance(v, str) and merged[k] == ''):
3139 merged[k] = v
3140 return merged
3141
3142
3143 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3144 return string if isinstance(string, str) else str(string, encoding, errors)
3145
3146
3147 US_RATINGS = {
3148 'G': 0,
3149 'PG': 10,
3150 'PG-13': 13,
3151 'R': 16,
3152 'NC': 18,
3153 }
3154
3155
3156 TV_PARENTAL_GUIDELINES = {
3157 'TV-Y': 0,
3158 'TV-Y7': 7,
3159 'TV-G': 0,
3160 'TV-PG': 0,
3161 'TV-14': 14,
3162 'TV-MA': 17,
3163 }
3164
3165
3166 def parse_age_limit(s):
3167 # isinstance(False, int) is True. So type() must be used instead
3168 if type(s) is int: # noqa: E721
3169 return s if 0 <= s <= 21 else None
3170 elif not isinstance(s, str):
3171 return None
3172 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3173 if m:
3174 return int(m.group('age'))
3175 s = s.upper()
3176 if s in US_RATINGS:
3177 return US_RATINGS[s]
3178 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3179 if m:
3180 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3181 return None
3182
3183
3184 def strip_jsonp(code):
3185 return re.sub(
3186 r'''(?sx)^
3187 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3188 (?:\s*&&\s*(?P=func_name))?
3189 \s*\(\s*(?P<callback_data>.*)\);?
3190 \s*?(?://[^\n]*)*$''',
3191 r'\g<callback_data>', code)
3192
3193
3194 def js_to_json(code, vars={}):
3195 # vars is a dict of var, val pairs to substitute
3196 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3197 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3198 INTEGER_TABLE = (
3199 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3200 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3201 )
3202
3203 def fix_kv(m):
3204 v = m.group(0)
3205 if v in ('true', 'false', 'null'):
3206 return v
3207 elif v in ('undefined', 'void 0'):
3208 return 'null'
3209 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3210 return ""
3211
3212 if v[0] in ("'", '"'):
3213 v = re.sub(r'(?s)\\.|"', lambda m: {
3214 '"': '\\"',
3215 "\\'": "'",
3216 '\\\n': '',
3217 '\\x': '\\u00',
3218 }.get(m.group(0), m.group(0)), v[1:-1])
3219 else:
3220 for regex, base in INTEGER_TABLE:
3221 im = re.match(regex, v)
3222 if im:
3223 i = int(im.group(1), base)
3224 return '"%d":' % i if v.endswith(':') else '%d' % i
3225
3226 if v in vars:
3227 return vars[v]
3228
3229 return '"%s"' % v
3230
3231 def create_map(mobj):
3232 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3233
3234 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3235 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3236
3237 return re.sub(r'''(?sx)
3238 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3239 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3240 {comment}|,(?={skip}[\]}}])|
3241 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3242 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3243 [0-9]+(?={skip}:)|
3244 !+
3245 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
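# Illustrative example:
#   js_to_json("{abc: 'def', 'ghi': true, jkl: 0x1F}")
#       -> '{"abc": "def", "ghi": true, "jkl": 31}'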
3246
3247
3248 def qualities(quality_ids):
3249 """ Get a numeric quality value out of a list of possible values """
3250 def q(qid):
3251 try:
3252 return quality_ids.index(qid)
3253 except ValueError:
3254 return -1
3255 return q
3256
3257
3258 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3259
3260
3261 DEFAULT_OUTTMPL = {
3262 'default': '%(title)s [%(id)s].%(ext)s',
3263 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3264 }
3265 OUTTMPL_TYPES = {
3266 'chapter': None,
3267 'subtitle': None,
3268 'thumbnail': None,
3269 'description': 'description',
3270 'annotation': 'annotations.xml',
3271 'infojson': 'info.json',
3272 'link': None,
3273 'pl_video': None,
3274 'pl_thumbnail': None,
3275 'pl_description': 'description',
3276 'pl_infojson': 'info.json',
3277 }
3278
3279 # As of [1] format syntax is:
3280 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3281 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3282 STR_FORMAT_RE_TMPL = r'''(?x)
3283 (?<!%)(?P<prefix>(?:%%)*)
3284 %
3285 (?P<has_key>\((?P<key>{0})\))?
3286 (?P<format>
3287 (?P<conversion>[#0\-+ ]+)?
3288 (?P<min_width>\d+)?
3289 (?P<precision>\.\d+)?
3290 (?P<len_mod>[hlL])? # unused in python
3291 {1} # conversion type
3292 )
3293 '''
3294
3295
3296 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3297
3298
3299 def limit_length(s, length):
3300 """ Add ellipses to overly long strings """
3301 if s is None:
3302 return None
3303 ELLIPSES = '...'
3304 if len(s) > length:
3305 return s[:length - len(ELLIPSES)] + ELLIPSES
3306 return s
3307
3308
3309 def version_tuple(v):
3310 return tuple(int(e) for e in re.split(r'[-.]', v))
3311
3312
3313 def is_outdated_version(version, limit, assume_new=True):
3314 if not version:
3315 return not assume_new
3316 try:
3317 return version_tuple(version) < version_tuple(limit)
3318 except ValueError:
3319 return not assume_new
3320
3321
3322 def ytdl_is_updateable():
3323 """ Returns if yt-dlp can be updated with -U """
3324
3325 from .update import is_non_updateable
3326
3327 return not is_non_updateable()
3328
3329
3330 def args_to_str(args):
3331 # Get a short string representation for a subprocess command
3332 return ' '.join(compat_shlex_quote(a) for a in args)
3333
3334
3335 def error_to_compat_str(err):
3336 return str(err)
3337
3338
3339 def error_to_str(err):
3340 return f'{type(err).__name__}: {err}'
3341
3342
3343 def mimetype2ext(mt):
3344 if mt is None:
3345 return None
3346
3347 mt, _, params = mt.partition(';')
3348 mt = mt.strip()
3349
3350 FULL_MAP = {
3351 'audio/mp4': 'm4a',
3352 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 as
3353 # it's the most popular one
3354 'audio/mpeg': 'mp3',
3355 'audio/x-wav': 'wav',
3356 'audio/wav': 'wav',
3357 'audio/wave': 'wav',
3358 }
3359
3360 ext = FULL_MAP.get(mt)
3361 if ext is not None:
3362 return ext
3363
3364 SUBTYPE_MAP = {
3365 '3gpp': '3gp',
3366 'smptett+xml': 'tt',
3367 'ttaf+xml': 'dfxp',
3368 'ttml+xml': 'ttml',
3369 'x-flv': 'flv',
3370 'x-mp4-fragmented': 'mp4',
3371 'x-ms-sami': 'sami',
3372 'x-ms-wmv': 'wmv',
3373 'mpegurl': 'm3u8',
3374 'x-mpegurl': 'm3u8',
3375 'vnd.apple.mpegurl': 'm3u8',
3376 'dash+xml': 'mpd',
3377 'f4m+xml': 'f4m',
3378 'hds+xml': 'f4m',
3379 'vnd.ms-sstr+xml': 'ism',
3380 'quicktime': 'mov',
3381 'mp2t': 'ts',
3382 'x-wav': 'wav',
3383 'filmstrip+json': 'fs',
3384 'svg+xml': 'svg',
3385 }
3386
3387 _, _, subtype = mt.rpartition('/')
3388 ext = SUBTYPE_MAP.get(subtype.lower())
3389 if ext is not None:
3390 return ext
3391
3392 SUFFIX_MAP = {
3393 'json': 'json',
3394 'xml': 'xml',
3395 'zip': 'zip',
3396 'gzip': 'gz',
3397 }
3398
3399 _, _, suffix = subtype.partition('+')
3400 ext = SUFFIX_MAP.get(suffix)
3401 if ext is not None:
3402 return ext
3403
3404 return subtype.replace('+', '.')
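# Illustrative examples:
#   mimetype2ext('application/dash+xml; charset=utf-8') -> 'mpd'
#   mimetype2ext('audio/x-wav')                         -> 'wav'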
3405
3406
3407 def ext2mimetype(ext_or_url):
3408 if not ext_or_url:
3409 return None
3410 if '.' not in ext_or_url:
3411 ext_or_url = f'file.{ext_or_url}'
3412 return mimetypes.guess_type(ext_or_url)[0]
3413
3414
3415 def parse_codecs(codecs_str):
3416 # http://tools.ietf.org/html/rfc6381
3417 if not codecs_str:
3418 return {}
3419 split_codecs = list(filter(None, map(
3420 str.strip, codecs_str.strip().strip(',').split(','))))
3421 vcodec, acodec, scodec, hdr = None, None, None, None
3422 for full_codec in split_codecs:
3423 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3424 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3425 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3426 if vcodec:
3427 continue
3428 vcodec = full_codec
3429 if parts[0] in ('dvh1', 'dvhe'):
3430 hdr = 'DV'
3431 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3432 hdr = 'HDR10'
3433 elif parts[:2] == ['vp9', '2']:
3434 hdr = 'HDR10'
3435 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3436 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3437 acodec = acodec or full_codec
3438 elif parts[0] in ('stpp', 'wvtt'):
3439 scodec = scodec or full_codec
3440 else:
3441 write_string(f'WARNING: Unknown codec {full_codec}\n')
3442 if vcodec or acodec or scodec:
3443 return {
3444 'vcodec': vcodec or 'none',
3445 'acodec': acodec or 'none',
3446 'dynamic_range': hdr,
3447 **({'scodec': scodec} if scodec is not None else {}),
3448 }
3449 elif len(split_codecs) == 2:
3450 return {
3451 'vcodec': split_codecs[0],
3452 'acodec': split_codecs[1],
3453 }
3454 return {}
3455
3456
3457 def urlhandle_detect_ext(url_handle):
3458 getheader = url_handle.headers.get
3459
3460 cd = getheader('Content-Disposition')
3461 if cd:
3462 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3463 if m:
3464 e = determine_ext(m.group('filename'), default_ext=None)
3465 if e:
3466 return e
3467
3468 return mimetype2ext(getheader('Content-Type'))
3469
3470
3471 def encode_data_uri(data, mime_type):
3472 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3473
3474
3475 def age_restricted(content_limit, age_limit):
3476 """ Returns True iff the content should be blocked """
3477
3478 if age_limit is None: # No limit set
3479 return False
3480 if content_limit is None:
3481 return False # Content available for everyone
3482 return age_limit < content_limit
3483
3484
3485 # List of known byte-order-marks (BOM)
3486 BOMS = [
3487 (b'\xef\xbb\xbf', 'utf-8'),
3488 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3489 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3490 (b'\xff\xfe', 'utf-16-le'),
3491 (b'\xfe\xff', 'utf-16-be'),
3492 ]
3493
3494
3495 def is_html(first_bytes):
3496 """ Detect whether a file contains HTML by examining its first bytes. """
3497
3498 encoding = 'utf-8'
3499 for bom, enc in BOMS:
3500 while first_bytes.startswith(bom):
3501 encoding, first_bytes = enc, first_bytes[len(bom):]
3502
3503 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3504
3505
3506 def determine_protocol(info_dict):
3507 protocol = info_dict.get('protocol')
3508 if protocol is not None:
3509 return protocol
3510
3511 url = sanitize_url(info_dict['url'])
3512 if url.startswith('rtmp'):
3513 return 'rtmp'
3514 elif url.startswith('mms'):
3515 return 'mms'
3516 elif url.startswith('rtsp'):
3517 return 'rtsp'
3518
3519 ext = determine_ext(url)
3520 if ext == 'm3u8':
3521 return 'm3u8'
3522 elif ext == 'f4m':
3523 return 'f4m'
3524
3525 return urllib.parse.urlparse(url).scheme
3526
3527
3528 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3529 """ Render a list of rows, each as a list of values.
3530 Text after a \\t will be right aligned """
3531 def width(string):
3532 return len(remove_terminal_sequences(string).replace('\t', ''))
3533
3534 def get_max_lens(table):
3535 return [max(width(str(v)) for v in col) for col in zip(*table)]
3536
3537 def filter_using_list(row, filterArray):
3538 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3539
3540 max_lens = get_max_lens(data) if hide_empty else []
3541 header_row = filter_using_list(header_row, max_lens)
3542 data = [filter_using_list(row, max_lens) for row in data]
3543
3544 table = [header_row] + data
3545 max_lens = get_max_lens(table)
3546 extra_gap += 1
3547 if delim:
3548 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3549 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3550 for row in table:
3551 for pos, text in enumerate(map(str, row)):
3552 if '\t' in text:
3553 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3554 else:
3555 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3556 ret = '\n'.join(''.join(row).rstrip() for row in table)
3557 return ret
3558
3559
3560 def _match_one(filter_part, dct, incomplete):
3561 # TODO: Generalize code with YoutubeDL._build_format_filter
3562 STRING_OPERATORS = {
3563 '*=': operator.contains,
3564 '^=': lambda attr, value: attr.startswith(value),
3565 '$=': lambda attr, value: attr.endswith(value),
3566 '~=': lambda attr, value: re.search(value, attr),
3567 }
3568 COMPARISON_OPERATORS = {
3569 **STRING_OPERATORS,
3570 '<=': operator.le, # "<=" must be defined above "<"
3571 '<': operator.lt,
3572 '>=': operator.ge,
3573 '>': operator.gt,
3574 '=': operator.eq,
3575 }
3576
3577 if isinstance(incomplete, bool):
3578 is_incomplete = lambda _: incomplete
3579 else:
3580 is_incomplete = lambda k: k in incomplete
3581
3582 operator_rex = re.compile(r'''(?x)
3583 (?P<key>[a-z_]+)
3584 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3585 (?:
3586 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3587 (?P<strval>.+?)
3588 )
3589 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3590 m = operator_rex.fullmatch(filter_part.strip())
3591 if m:
3592 m = m.groupdict()
3593 unnegated_op = COMPARISON_OPERATORS[m['op']]
3594 if m['negation']:
3595 op = lambda attr, value: not unnegated_op(attr, value)
3596 else:
3597 op = unnegated_op
3598 comparison_value = m['quotedstrval'] or m['strval']  # the regex above defines no 'intval' group
3599 if m['quote']:
3600 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3601 actual_value = dct.get(m['key'])
3602 numeric_comparison = None
3603 if isinstance(actual_value, (int, float)):
3604 # If the original field is a string and the matching comparison value is
3605 # a number, we should respect the origin of the original field
3606 # and process the comparison value as a string (see
3607 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3608 try:
3609 numeric_comparison = int(comparison_value)
3610 except ValueError:
3611 numeric_comparison = parse_filesize(comparison_value)
3612 if numeric_comparison is None:
3613 numeric_comparison = parse_filesize(f'{comparison_value}B')
3614 if numeric_comparison is None:
3615 numeric_comparison = parse_duration(comparison_value)
3616 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3617 raise ValueError('Operator %s only supports string values!' % m['op'])
3618 if actual_value is None:
3619 return is_incomplete(m['key']) or m['none_inclusive']
3620 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3621
3622 UNARY_OPERATORS = {
3623 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3624 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3625 }
3626 operator_rex = re.compile(r'''(?x)
3627 (?P<op>%s)\s*(?P<key>[a-z_]+)
3628 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3629 m = operator_rex.fullmatch(filter_part.strip())
3630 if m:
3631 op = UNARY_OPERATORS[m.group('op')]
3632 actual_value = dct.get(m.group('key'))
3633 if is_incomplete(m.group('key')) and actual_value is None:
3634 return True
3635 return op(actual_value)
3636
3637 raise ValueError('Invalid filter part %r' % filter_part)
3638
3639
3640 def match_str(filter_str, dct, incomplete=False):
3641 """ Filter a dictionary with a simple string syntax.
3642 @returns Whether the filter passes
3643 @param incomplete Set of keys that is expected to be missing from dct.
3644 Can be True/False to indicate all/none of the keys may be missing.
3645 All conditions on incomplete keys pass if the key is missing
3646 """
3647 return all(
3648 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3649 for filter_part in re.split(r'(?<!\\)&', filter_str))
3650
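# Hedged examples of the filter mini-language (the dicts are made up):
#   >>> match_str('duration > 60 & title *= cat', {'duration': 120, 'title': 'cat video'})
#   True
#   >>> match_str('!is_live', {'is_live': False})
#   True
#   >>> match_str('uploader = foo', {'uploader': 'bar'})
#   False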
3651
3652 def match_filter_func(filters):
3653 if not filters:
3654 return None
3655 filters = set(variadic(filters))
3656
3657 interactive = '-' in filters
3658 if interactive:
3659 filters.remove('-')
3660
3661 def _match_func(info_dict, incomplete=False):
3662 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3663 return NO_DEFAULT if interactive and not incomplete else None
3664 else:
3665 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3666 filter_str = ') | ('.join(map(str.strip, filters))
3667 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3668 return _match_func
3669
3670
3671 class download_range_func:
3672 def __init__(self, chapters, ranges):
3673 self.chapters, self.ranges = chapters, ranges
3674
3675 def __call__(self, info_dict, ydl):
3676 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3677 else 'Cannot match chapters since chapter information is unavailable')
3678 for regex in self.chapters or []:
3679 for i, chapter in enumerate(info_dict.get('chapters') or []):
3680 if re.search(regex, chapter['title']):
3681 warning = None
3682 yield {**chapter, 'index': i}
3683 if self.chapters and warning:
3684 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3685
3686 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3687
3688 def __eq__(self, other):
3689 return (isinstance(other, download_range_func)
3690 and self.chapters == other.chapters and self.ranges == other.ranges)
3691
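# Hedged example (made-up ranges; matching chapters needs an info_dict with chapters):
#   >>> list(download_range_func(None, [(10, 20)])({}, None))
#   [{'start_time': 10, 'end_time': 20}]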
3692
3693 def parse_dfxp_time_expr(time_expr):
3694 if not time_expr:
3695 return
3696
3697 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3698 if mobj:
3699 return float(mobj.group('time_offset'))
3700
3701 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3702 if mobj:
3703 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3704
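# e.g. (illustrative):
#   >>> parse_dfxp_time_expr('1.5s')
#   1.5
#   >>> parse_dfxp_time_expr('00:01:10.25')
#   70.25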
3705
3706 def srt_subtitles_timecode(seconds):
3707 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3708
3709
3710 def ass_subtitles_timecode(seconds):
3711 time = timetuple_from_msec(seconds * 1000)
3712 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3713
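# Both helpers format via timetuple_from_msec; illustrative values:
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'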
3714
3715 def dfxp2srt(dfxp_data):
3716 '''
3717 @param dfxp_data A bytes-like object containing DFXP data
3718 @returns A unicode object containing converted SRT data
3719 '''
3720 LEGACY_NAMESPACES = (
3721 (b'http://www.w3.org/ns/ttml', [
3722 b'http://www.w3.org/2004/11/ttaf1',
3723 b'http://www.w3.org/2006/04/ttaf1',
3724 b'http://www.w3.org/2006/10/ttaf1',
3725 ]),
3726 (b'http://www.w3.org/ns/ttml#styling', [
3727 b'http://www.w3.org/ns/ttml#style',
3728 ]),
3729 )
3730
3731 SUPPORTED_STYLING = [
3732 'color',
3733 'fontFamily',
3734 'fontSize',
3735 'fontStyle',
3736 'fontWeight',
3737 'textDecoration'
3738 ]
3739
3740 _x = functools.partial(xpath_with_ns, ns_map={
3741 'xml': 'http://www.w3.org/XML/1998/namespace',
3742 'ttml': 'http://www.w3.org/ns/ttml',
3743 'tts': 'http://www.w3.org/ns/ttml#styling',
3744 })
3745
3746 styles = {}
3747 default_style = {}
3748
3749 class TTMLPElementParser:
3750 _out = ''
3751 _unclosed_elements = []
3752 _applied_styles = []
3753
3754 def start(self, tag, attrib):
3755 if tag in (_x('ttml:br'), 'br'):
3756 self._out += '\n'
3757 else:
3758 unclosed_elements = []
3759 style = {}
3760 element_style_id = attrib.get('style')
3761 if default_style:
3762 style.update(default_style)
3763 if element_style_id:
3764 style.update(styles.get(element_style_id, {}))
3765 for prop in SUPPORTED_STYLING:
3766 prop_val = attrib.get(_x('tts:' + prop))
3767 if prop_val:
3768 style[prop] = prop_val
3769 if style:
3770 font = ''
3771 for k, v in sorted(style.items()):
3772 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3773 continue
3774 if k == 'color':
3775 font += ' color="%s"' % v
3776 elif k == 'fontSize':
3777 font += ' size="%s"' % v
3778 elif k == 'fontFamily':
3779 font += ' face="%s"' % v
3780 elif k == 'fontWeight' and v == 'bold':
3781 self._out += '<b>'
3782 unclosed_elements.append('b')
3783 elif k == 'fontStyle' and v == 'italic':
3784 self._out += '<i>'
3785 unclosed_elements.append('i')
3786 elif k == 'textDecoration' and v == 'underline':
3787 self._out += '<u>'
3788 unclosed_elements.append('u')
3789 if font:
3790 self._out += '<font' + font + '>'
3791 unclosed_elements.append('font')
3792 applied_style = {}
3793 if self._applied_styles:
3794 applied_style.update(self._applied_styles[-1])
3795 applied_style.update(style)
3796 self._applied_styles.append(applied_style)
3797 self._unclosed_elements.append(unclosed_elements)
3798
3799 def end(self, tag):
3800 if tag not in (_x('ttml:br'), 'br'):
3801 unclosed_elements = self._unclosed_elements.pop()
3802 for element in reversed(unclosed_elements):
3803 self._out += '</%s>' % element
3804 if unclosed_elements and self._applied_styles:
3805 self._applied_styles.pop()
3806
3807 def data(self, data):
3808 self._out += data
3809
3810 def close(self):
3811 return self._out.strip()
3812
3813 def parse_node(node):
3814 target = TTMLPElementParser()
3815 parser = xml.etree.ElementTree.XMLParser(target=target)
3816 parser.feed(xml.etree.ElementTree.tostring(node))
3817 return parser.close()
3818
3819 for k, v in LEGACY_NAMESPACES:
3820 for ns in v:
3821 dfxp_data = dfxp_data.replace(ns, k)
3822
3823 dfxp = compat_etree_fromstring(dfxp_data)
3824 out = []
3825 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3826
3827 if not paras:
3828 raise ValueError('Invalid dfxp/TTML subtitle')
3829
3830 repeat = False
3831 while True:
3832 for style in dfxp.findall(_x('.//ttml:style')):
3833 style_id = style.get('id') or style.get(_x('xml:id'))
3834 if not style_id:
3835 continue
3836 parent_style_id = style.get('style')
3837 if parent_style_id:
3838 if parent_style_id not in styles:
3839 repeat = True
3840 continue
3841 styles[style_id] = styles[parent_style_id].copy()
3842 for prop in SUPPORTED_STYLING:
3843 prop_val = style.get(_x('tts:' + prop))
3844 if prop_val:
3845 styles.setdefault(style_id, {})[prop] = prop_val
3846 if repeat:
3847 repeat = False
3848 else:
3849 break
3850
3851 for p in ('body', 'div'):
3852 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3853 if ele is None:
3854 continue
3855 style = styles.get(ele.get('style'))
3856 if not style:
3857 continue
3858 default_style.update(style)
3859
3860 for para, index in zip(paras, itertools.count(1)):
3861 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3862 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3863 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3864 if begin_time is None:
3865 continue
3866 if not end_time:
3867 if not dur:
3868 continue
3869 end_time = begin_time + dur
3870 out.append('%d\n%s --> %s\n%s\n\n' % (
3871 index,
3872 srt_subtitles_timecode(begin_time),
3873 srt_subtitles_timecode(end_time),
3874 parse_node(para)))
3875
3876 return ''.join(out)
3877
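# A minimal sketch (made-up TTML; real documents carry styling and more cues):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body>'
#   ...          b'<p begin="0s" end="1s">Hi</p></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,000\nHi\n\n'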
3878
3879 def cli_option(params, command_option, param, separator=None):
3880 param = params.get(param)
3881 return ([] if param is None
3882 else [command_option, str(param)] if separator is None
3883 else [f'{command_option}{separator}{param}'])
3884
3885
3886 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3887 param = params.get(param)
3888 assert param in (True, False, None)
3889 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3890
3891
3892 def cli_valueless_option(params, command_option, param, expected_value=True):
3893 return [command_option] if params.get(param) == expected_value else []
3894
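# Hedged examples (the option/param names are made up):
#   >>> cli_option({'proxy': 'http://example.com'}, '--proxy', 'proxy')
#   ['--proxy', 'http://example.com']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'quiet': False}, '--quiet', 'quiet')
#   []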
3895
3896 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3897 if isinstance(argdict, (list, tuple)): # for backward compatibility
3898 if use_compat:
3899 return argdict
3900 else:
3901 argdict = None
3902 if argdict is None:
3903 return default
3904 assert isinstance(argdict, dict)
3905
3906 assert isinstance(keys, (list, tuple))
3907 for key_list in keys:
3908 arg_list = list(filter(
3909 lambda x: x is not None,
3910 [argdict.get(key.lower()) for key in variadic(key_list)]))
3911 if arg_list:
3912 return [arg for args in arg_list for arg in args]
3913 return default
3914
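# Hedged example (keys made up): the first key group with any args wins:
#   >>> cli_configuration_args({'ffmpeg': ['-v', 'error'], 'default': []}, ['ffmpeg', 'default'])
#   ['-v', 'error']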
3915
3916 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3917 main_key, exe = main_key.lower(), exe.lower()
3918 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3919 keys = [f'{root_key}{k}' for k in (keys or [''])]
3920 if root_key in keys:
3921 if main_key != exe:
3922 keys.append((main_key, exe))
3923 keys.append('default')
3924 else:
3925 use_compat = False
3926 return cli_configuration_args(argdict, keys, default, use_compat)
3927
3928
3929 class ISO639Utils:
3930 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3931 _lang_map = {
3932 'aa': 'aar',
3933 'ab': 'abk',
3934 'ae': 'ave',
3935 'af': 'afr',
3936 'ak': 'aka',
3937 'am': 'amh',
3938 'an': 'arg',
3939 'ar': 'ara',
3940 'as': 'asm',
3941 'av': 'ava',
3942 'ay': 'aym',
3943 'az': 'aze',
3944 'ba': 'bak',
3945 'be': 'bel',
3946 'bg': 'bul',
3947 'bh': 'bih',
3948 'bi': 'bis',
3949 'bm': 'bam',
3950 'bn': 'ben',
3951 'bo': 'bod',
3952 'br': 'bre',
3953 'bs': 'bos',
3954 'ca': 'cat',
3955 'ce': 'che',
3956 'ch': 'cha',
3957 'co': 'cos',
3958 'cr': 'cre',
3959 'cs': 'ces',
3960 'cu': 'chu',
3961 'cv': 'chv',
3962 'cy': 'cym',
3963 'da': 'dan',
3964 'de': 'deu',
3965 'dv': 'div',
3966 'dz': 'dzo',
3967 'ee': 'ewe',
3968 'el': 'ell',
3969 'en': 'eng',
3970 'eo': 'epo',
3971 'es': 'spa',
3972 'et': 'est',
3973 'eu': 'eus',
3974 'fa': 'fas',
3975 'ff': 'ful',
3976 'fi': 'fin',
3977 'fj': 'fij',
3978 'fo': 'fao',
3979 'fr': 'fra',
3980 'fy': 'fry',
3981 'ga': 'gle',
3982 'gd': 'gla',
3983 'gl': 'glg',
3984 'gn': 'grn',
3985 'gu': 'guj',
3986 'gv': 'glv',
3987 'ha': 'hau',
3988 'he': 'heb',
3989 'iw': 'heb', # Replaced by he in 1989 revision
3990 'hi': 'hin',
3991 'ho': 'hmo',
3992 'hr': 'hrv',
3993 'ht': 'hat',
3994 'hu': 'hun',
3995 'hy': 'hye',
3996 'hz': 'her',
3997 'ia': 'ina',
3998 'id': 'ind',
3999 'in': 'ind', # Replaced by id in 1989 revision
4000 'ie': 'ile',
4001 'ig': 'ibo',
4002 'ii': 'iii',
4003 'ik': 'ipk',
4004 'io': 'ido',
4005 'is': 'isl',
4006 'it': 'ita',
4007 'iu': 'iku',
4008 'ja': 'jpn',
4009 'jv': 'jav',
4010 'ka': 'kat',
4011 'kg': 'kon',
4012 'ki': 'kik',
4013 'kj': 'kua',
4014 'kk': 'kaz',
4015 'kl': 'kal',
4016 'km': 'khm',
4017 'kn': 'kan',
4018 'ko': 'kor',
4019 'kr': 'kau',
4020 'ks': 'kas',
4021 'ku': 'kur',
4022 'kv': 'kom',
4023 'kw': 'cor',
4024 'ky': 'kir',
4025 'la': 'lat',
4026 'lb': 'ltz',
4027 'lg': 'lug',
4028 'li': 'lim',
4029 'ln': 'lin',
4030 'lo': 'lao',
4031 'lt': 'lit',
4032 'lu': 'lub',
4033 'lv': 'lav',
4034 'mg': 'mlg',
4035 'mh': 'mah',
4036 'mi': 'mri',
4037 'mk': 'mkd',
4038 'ml': 'mal',
4039 'mn': 'mon',
4040 'mr': 'mar',
4041 'ms': 'msa',
4042 'mt': 'mlt',
4043 'my': 'mya',
4044 'na': 'nau',
4045 'nb': 'nob',
4046 'nd': 'nde',
4047 'ne': 'nep',
4048 'ng': 'ndo',
4049 'nl': 'nld',
4050 'nn': 'nno',
4051 'no': 'nor',
4052 'nr': 'nbl',
4053 'nv': 'nav',
4054 'ny': 'nya',
4055 'oc': 'oci',
4056 'oj': 'oji',
4057 'om': 'orm',
4058 'or': 'ori',
4059 'os': 'oss',
4060 'pa': 'pan',
4061 'pi': 'pli',
4062 'pl': 'pol',
4063 'ps': 'pus',
4064 'pt': 'por',
4065 'qu': 'que',
4066 'rm': 'roh',
4067 'rn': 'run',
4068 'ro': 'ron',
4069 'ru': 'rus',
4070 'rw': 'kin',
4071 'sa': 'san',
4072 'sc': 'srd',
4073 'sd': 'snd',
4074 'se': 'sme',
4075 'sg': 'sag',
4076 'si': 'sin',
4077 'sk': 'slk',
4078 'sl': 'slv',
4079 'sm': 'smo',
4080 'sn': 'sna',
4081 'so': 'som',
4082 'sq': 'sqi',
4083 'sr': 'srp',
4084 'ss': 'ssw',
4085 'st': 'sot',
4086 'su': 'sun',
4087 'sv': 'swe',
4088 'sw': 'swa',
4089 'ta': 'tam',
4090 'te': 'tel',
4091 'tg': 'tgk',
4092 'th': 'tha',
4093 'ti': 'tir',
4094 'tk': 'tuk',
4095 'tl': 'tgl',
4096 'tn': 'tsn',
4097 'to': 'ton',
4098 'tr': 'tur',
4099 'ts': 'tso',
4100 'tt': 'tat',
4101 'tw': 'twi',
4102 'ty': 'tah',
4103 'ug': 'uig',
4104 'uk': 'ukr',
4105 'ur': 'urd',
4106 'uz': 'uzb',
4107 've': 'ven',
4108 'vi': 'vie',
4109 'vo': 'vol',
4110 'wa': 'wln',
4111 'wo': 'wol',
4112 'xh': 'xho',
4113 'yi': 'yid',
4114 'ji': 'yid', # Replaced by yi in 1989 revision
4115 'yo': 'yor',
4116 'za': 'zha',
4117 'zh': 'zho',
4118 'zu': 'zul',
4119 }
4120
4121 @classmethod
4122 def short2long(cls, code):
4123 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4124 return cls._lang_map.get(code[:2])
4125
4126 @classmethod
4127 def long2short(cls, code):
4128 """Convert language code from ISO 639-2/T to ISO 639-1"""
4129 for short_name, long_name in cls._lang_map.items():
4130 if long_name == code:
4131 return short_name
4132
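# e.g. (illustrative):
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'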
4133
4134 class ISO3166Utils:
4135 # From http://data.okfn.org/data/core/country-list
4136 _country_map = {
4137 'AF': 'Afghanistan',
4138 'AX': 'Åland Islands',
4139 'AL': 'Albania',
4140 'DZ': 'Algeria',
4141 'AS': 'American Samoa',
4142 'AD': 'Andorra',
4143 'AO': 'Angola',
4144 'AI': 'Anguilla',
4145 'AQ': 'Antarctica',
4146 'AG': 'Antigua and Barbuda',
4147 'AR': 'Argentina',
4148 'AM': 'Armenia',
4149 'AW': 'Aruba',
4150 'AU': 'Australia',
4151 'AT': 'Austria',
4152 'AZ': 'Azerbaijan',
4153 'BS': 'Bahamas',
4154 'BH': 'Bahrain',
4155 'BD': 'Bangladesh',
4156 'BB': 'Barbados',
4157 'BY': 'Belarus',
4158 'BE': 'Belgium',
4159 'BZ': 'Belize',
4160 'BJ': 'Benin',
4161 'BM': 'Bermuda',
4162 'BT': 'Bhutan',
4163 'BO': 'Bolivia, Plurinational State of',
4164 'BQ': 'Bonaire, Sint Eustatius and Saba',
4165 'BA': 'Bosnia and Herzegovina',
4166 'BW': 'Botswana',
4167 'BV': 'Bouvet Island',
4168 'BR': 'Brazil',
4169 'IO': 'British Indian Ocean Territory',
4170 'BN': 'Brunei Darussalam',
4171 'BG': 'Bulgaria',
4172 'BF': 'Burkina Faso',
4173 'BI': 'Burundi',
4174 'KH': 'Cambodia',
4175 'CM': 'Cameroon',
4176 'CA': 'Canada',
4177 'CV': 'Cape Verde',
4178 'KY': 'Cayman Islands',
4179 'CF': 'Central African Republic',
4180 'TD': 'Chad',
4181 'CL': 'Chile',
4182 'CN': 'China',
4183 'CX': 'Christmas Island',
4184 'CC': 'Cocos (Keeling) Islands',
4185 'CO': 'Colombia',
4186 'KM': 'Comoros',
4187 'CG': 'Congo',
4188 'CD': 'Congo, the Democratic Republic of the',
4189 'CK': 'Cook Islands',
4190 'CR': 'Costa Rica',
4191 'CI': 'Côte d\'Ivoire',
4192 'HR': 'Croatia',
4193 'CU': 'Cuba',
4194 'CW': 'Curaçao',
4195 'CY': 'Cyprus',
4196 'CZ': 'Czech Republic',
4197 'DK': 'Denmark',
4198 'DJ': 'Djibouti',
4199 'DM': 'Dominica',
4200 'DO': 'Dominican Republic',
4201 'EC': 'Ecuador',
4202 'EG': 'Egypt',
4203 'SV': 'El Salvador',
4204 'GQ': 'Equatorial Guinea',
4205 'ER': 'Eritrea',
4206 'EE': 'Estonia',
4207 'ET': 'Ethiopia',
4208 'FK': 'Falkland Islands (Malvinas)',
4209 'FO': 'Faroe Islands',
4210 'FJ': 'Fiji',
4211 'FI': 'Finland',
4212 'FR': 'France',
4213 'GF': 'French Guiana',
4214 'PF': 'French Polynesia',
4215 'TF': 'French Southern Territories',
4216 'GA': 'Gabon',
4217 'GM': 'Gambia',
4218 'GE': 'Georgia',
4219 'DE': 'Germany',
4220 'GH': 'Ghana',
4221 'GI': 'Gibraltar',
4222 'GR': 'Greece',
4223 'GL': 'Greenland',
4224 'GD': 'Grenada',
4225 'GP': 'Guadeloupe',
4226 'GU': 'Guam',
4227 'GT': 'Guatemala',
4228 'GG': 'Guernsey',
4229 'GN': 'Guinea',
4230 'GW': 'Guinea-Bissau',
4231 'GY': 'Guyana',
4232 'HT': 'Haiti',
4233 'HM': 'Heard Island and McDonald Islands',
4234 'VA': 'Holy See (Vatican City State)',
4235 'HN': 'Honduras',
4236 'HK': 'Hong Kong',
4237 'HU': 'Hungary',
4238 'IS': 'Iceland',
4239 'IN': 'India',
4240 'ID': 'Indonesia',
4241 'IR': 'Iran, Islamic Republic of',
4242 'IQ': 'Iraq',
4243 'IE': 'Ireland',
4244 'IM': 'Isle of Man',
4245 'IL': 'Israel',
4246 'IT': 'Italy',
4247 'JM': 'Jamaica',
4248 'JP': 'Japan',
4249 'JE': 'Jersey',
4250 'JO': 'Jordan',
4251 'KZ': 'Kazakhstan',
4252 'KE': 'Kenya',
4253 'KI': 'Kiribati',
4254 'KP': 'Korea, Democratic People\'s Republic of',
4255 'KR': 'Korea, Republic of',
4256 'KW': 'Kuwait',
4257 'KG': 'Kyrgyzstan',
4258 'LA': 'Lao People\'s Democratic Republic',
4259 'LV': 'Latvia',
4260 'LB': 'Lebanon',
4261 'LS': 'Lesotho',
4262 'LR': 'Liberia',
4263 'LY': 'Libya',
4264 'LI': 'Liechtenstein',
4265 'LT': 'Lithuania',
4266 'LU': 'Luxembourg',
4267 'MO': 'Macao',
4268 'MK': 'Macedonia, the Former Yugoslav Republic of',
4269 'MG': 'Madagascar',
4270 'MW': 'Malawi',
4271 'MY': 'Malaysia',
4272 'MV': 'Maldives',
4273 'ML': 'Mali',
4274 'MT': 'Malta',
4275 'MH': 'Marshall Islands',
4276 'MQ': 'Martinique',
4277 'MR': 'Mauritania',
4278 'MU': 'Mauritius',
4279 'YT': 'Mayotte',
4280 'MX': 'Mexico',
4281 'FM': 'Micronesia, Federated States of',
4282 'MD': 'Moldova, Republic of',
4283 'MC': 'Monaco',
4284 'MN': 'Mongolia',
4285 'ME': 'Montenegro',
4286 'MS': 'Montserrat',
4287 'MA': 'Morocco',
4288 'MZ': 'Mozambique',
4289 'MM': 'Myanmar',
4290 'NA': 'Namibia',
4291 'NR': 'Nauru',
4292 'NP': 'Nepal',
4293 'NL': 'Netherlands',
4294 'NC': 'New Caledonia',
4295 'NZ': 'New Zealand',
4296 'NI': 'Nicaragua',
4297 'NE': 'Niger',
4298 'NG': 'Nigeria',
4299 'NU': 'Niue',
4300 'NF': 'Norfolk Island',
4301 'MP': 'Northern Mariana Islands',
4302 'NO': 'Norway',
4303 'OM': 'Oman',
4304 'PK': 'Pakistan',
4305 'PW': 'Palau',
4306 'PS': 'Palestine, State of',
4307 'PA': 'Panama',
4308 'PG': 'Papua New Guinea',
4309 'PY': 'Paraguay',
4310 'PE': 'Peru',
4311 'PH': 'Philippines',
4312 'PN': 'Pitcairn',
4313 'PL': 'Poland',
4314 'PT': 'Portugal',
4315 'PR': 'Puerto Rico',
4316 'QA': 'Qatar',
4317 'RE': 'Réunion',
4318 'RO': 'Romania',
4319 'RU': 'Russian Federation',
4320 'RW': 'Rwanda',
4321 'BL': 'Saint Barthélemy',
4322 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4323 'KN': 'Saint Kitts and Nevis',
4324 'LC': 'Saint Lucia',
4325 'MF': 'Saint Martin (French part)',
4326 'PM': 'Saint Pierre and Miquelon',
4327 'VC': 'Saint Vincent and the Grenadines',
4328 'WS': 'Samoa',
4329 'SM': 'San Marino',
4330 'ST': 'Sao Tome and Principe',
4331 'SA': 'Saudi Arabia',
4332 'SN': 'Senegal',
4333 'RS': 'Serbia',
4334 'SC': 'Seychelles',
4335 'SL': 'Sierra Leone',
4336 'SG': 'Singapore',
4337 'SX': 'Sint Maarten (Dutch part)',
4338 'SK': 'Slovakia',
4339 'SI': 'Slovenia',
4340 'SB': 'Solomon Islands',
4341 'SO': 'Somalia',
4342 'ZA': 'South Africa',
4343 'GS': 'South Georgia and the South Sandwich Islands',
4344 'SS': 'South Sudan',
4345 'ES': 'Spain',
4346 'LK': 'Sri Lanka',
4347 'SD': 'Sudan',
4348 'SR': 'Suriname',
4349 'SJ': 'Svalbard and Jan Mayen',
4350 'SZ': 'Swaziland',
4351 'SE': 'Sweden',
4352 'CH': 'Switzerland',
4353 'SY': 'Syrian Arab Republic',
4354 'TW': 'Taiwan, Province of China',
4355 'TJ': 'Tajikistan',
4356 'TZ': 'Tanzania, United Republic of',
4357 'TH': 'Thailand',
4358 'TL': 'Timor-Leste',
4359 'TG': 'Togo',
4360 'TK': 'Tokelau',
4361 'TO': 'Tonga',
4362 'TT': 'Trinidad and Tobago',
4363 'TN': 'Tunisia',
4364 'TR': 'Turkey',
4365 'TM': 'Turkmenistan',
4366 'TC': 'Turks and Caicos Islands',
4367 'TV': 'Tuvalu',
4368 'UG': 'Uganda',
4369 'UA': 'Ukraine',
4370 'AE': 'United Arab Emirates',
4371 'GB': 'United Kingdom',
4372 'US': 'United States',
4373 'UM': 'United States Minor Outlying Islands',
4374 'UY': 'Uruguay',
4375 'UZ': 'Uzbekistan',
4376 'VU': 'Vanuatu',
4377 'VE': 'Venezuela, Bolivarian Republic of',
4378 'VN': 'Viet Nam',
4379 'VG': 'Virgin Islands, British',
4380 'VI': 'Virgin Islands, U.S.',
4381 'WF': 'Wallis and Futuna',
4382 'EH': 'Western Sahara',
4383 'YE': 'Yemen',
4384 'ZM': 'Zambia',
4385 'ZW': 'Zimbabwe',
4386 # Not ISO 3166 codes, but used for IP blocks
4387 'AP': 'Asia/Pacific Region',
4388 'EU': 'Europe',
4389 }
4390
4391 @classmethod
4392 def short2full(cls, code):
4393 """Convert an ISO 3166-2 country code to the corresponding full name"""
4394 return cls._country_map.get(code.upper())
4395
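# e.g. (illustrative):
#   >>> ISO3166Utils.short2full('us')
#   'United States'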
4396
4397 class GeoUtils:
4398 # Major IPv4 address blocks per country
4399 _country_ip_map = {
4400 'AD': '46.172.224.0/19',
4401 'AE': '94.200.0.0/13',
4402 'AF': '149.54.0.0/17',
4403 'AG': '209.59.64.0/18',
4404 'AI': '204.14.248.0/21',
4405 'AL': '46.99.0.0/16',
4406 'AM': '46.70.0.0/15',
4407 'AO': '105.168.0.0/13',
4408 'AP': '182.50.184.0/21',
4409 'AQ': '23.154.160.0/24',
4410 'AR': '181.0.0.0/12',
4411 'AS': '202.70.112.0/20',
4412 'AT': '77.116.0.0/14',
4413 'AU': '1.128.0.0/11',
4414 'AW': '181.41.0.0/18',
4415 'AX': '185.217.4.0/22',
4416 'AZ': '5.197.0.0/16',
4417 'BA': '31.176.128.0/17',
4418 'BB': '65.48.128.0/17',
4419 'BD': '114.130.0.0/16',
4420 'BE': '57.0.0.0/8',
4421 'BF': '102.178.0.0/15',
4422 'BG': '95.42.0.0/15',
4423 'BH': '37.131.0.0/17',
4424 'BI': '154.117.192.0/18',
4425 'BJ': '137.255.0.0/16',
4426 'BL': '185.212.72.0/23',
4427 'BM': '196.12.64.0/18',
4428 'BN': '156.31.0.0/16',
4429 'BO': '161.56.0.0/16',
4430 'BQ': '161.0.80.0/20',
4431 'BR': '191.128.0.0/12',
4432 'BS': '24.51.64.0/18',
4433 'BT': '119.2.96.0/19',
4434 'BW': '168.167.0.0/16',
4435 'BY': '178.120.0.0/13',
4436 'BZ': '179.42.192.0/18',
4437 'CA': '99.224.0.0/11',
4438 'CD': '41.243.0.0/16',
4439 'CF': '197.242.176.0/21',
4440 'CG': '160.113.0.0/16',
4441 'CH': '85.0.0.0/13',
4442 'CI': '102.136.0.0/14',
4443 'CK': '202.65.32.0/19',
4444 'CL': '152.172.0.0/14',
4445 'CM': '102.244.0.0/14',
4446 'CN': '36.128.0.0/10',
4447 'CO': '181.240.0.0/12',
4448 'CR': '201.192.0.0/12',
4449 'CU': '152.206.0.0/15',
4450 'CV': '165.90.96.0/19',
4451 'CW': '190.88.128.0/17',
4452 'CY': '31.153.0.0/16',
4453 'CZ': '88.100.0.0/14',
4454 'DE': '53.0.0.0/8',
4455 'DJ': '197.241.0.0/17',
4456 'DK': '87.48.0.0/12',
4457 'DM': '192.243.48.0/20',
4458 'DO': '152.166.0.0/15',
4459 'DZ': '41.96.0.0/12',
4460 'EC': '186.68.0.0/15',
4461 'EE': '90.190.0.0/15',
4462 'EG': '156.160.0.0/11',
4463 'ER': '196.200.96.0/20',
4464 'ES': '88.0.0.0/11',
4465 'ET': '196.188.0.0/14',
4466 'EU': '2.16.0.0/13',
4467 'FI': '91.152.0.0/13',
4468 'FJ': '144.120.0.0/16',
4469 'FK': '80.73.208.0/21',
4470 'FM': '119.252.112.0/20',
4471 'FO': '88.85.32.0/19',
4472 'FR': '90.0.0.0/9',
4473 'GA': '41.158.0.0/15',
4474 'GB': '25.0.0.0/8',
4475 'GD': '74.122.88.0/21',
4476 'GE': '31.146.0.0/16',
4477 'GF': '161.22.64.0/18',
4478 'GG': '62.68.160.0/19',
4479 'GH': '154.160.0.0/12',
4480 'GI': '95.164.0.0/16',
4481 'GL': '88.83.0.0/19',
4482 'GM': '160.182.0.0/15',
4483 'GN': '197.149.192.0/18',
4484 'GP': '104.250.0.0/19',
4485 'GQ': '105.235.224.0/20',
4486 'GR': '94.64.0.0/13',
4487 'GT': '168.234.0.0/16',
4488 'GU': '168.123.0.0/16',
4489 'GW': '197.214.80.0/20',
4490 'GY': '181.41.64.0/18',
4491 'HK': '113.252.0.0/14',
4492 'HN': '181.210.0.0/16',
4493 'HR': '93.136.0.0/13',
4494 'HT': '148.102.128.0/17',
4495 'HU': '84.0.0.0/14',
4496 'ID': '39.192.0.0/10',
4497 'IE': '87.32.0.0/12',
4498 'IL': '79.176.0.0/13',
4499 'IM': '5.62.80.0/20',
4500 'IN': '117.192.0.0/10',
4501 'IO': '203.83.48.0/21',
4502 'IQ': '37.236.0.0/14',
4503 'IR': '2.176.0.0/12',
4504 'IS': '82.221.0.0/16',
4505 'IT': '79.0.0.0/10',
4506 'JE': '87.244.64.0/18',
4507 'JM': '72.27.0.0/17',
4508 'JO': '176.29.0.0/16',
4509 'JP': '133.0.0.0/8',
4510 'KE': '105.48.0.0/12',
4511 'KG': '158.181.128.0/17',
4512 'KH': '36.37.128.0/17',
4513 'KI': '103.25.140.0/22',
4514 'KM': '197.255.224.0/20',
4515 'KN': '198.167.192.0/19',
4516 'KP': '175.45.176.0/22',
4517 'KR': '175.192.0.0/10',
4518 'KW': '37.36.0.0/14',
4519 'KY': '64.96.0.0/15',
4520 'KZ': '2.72.0.0/13',
4521 'LA': '115.84.64.0/18',
4522 'LB': '178.135.0.0/16',
4523 'LC': '24.92.144.0/20',
4524 'LI': '82.117.0.0/19',
4525 'LK': '112.134.0.0/15',
4526 'LR': '102.183.0.0/16',
4527 'LS': '129.232.0.0/17',
4528 'LT': '78.56.0.0/13',
4529 'LU': '188.42.0.0/16',
4530 'LV': '46.109.0.0/16',
4531 'LY': '41.252.0.0/14',
4532 'MA': '105.128.0.0/11',
4533 'MC': '88.209.64.0/18',
4534 'MD': '37.246.0.0/16',
4535 'ME': '178.175.0.0/17',
4536 'MF': '74.112.232.0/21',
4537 'MG': '154.126.0.0/17',
4538 'MH': '117.103.88.0/21',
4539 'MK': '77.28.0.0/15',
4540 'ML': '154.118.128.0/18',
4541 'MM': '37.111.0.0/17',
4542 'MN': '49.0.128.0/17',
4543 'MO': '60.246.0.0/16',
4544 'MP': '202.88.64.0/20',
4545 'MQ': '109.203.224.0/19',
4546 'MR': '41.188.64.0/18',
4547 'MS': '208.90.112.0/22',
4548 'MT': '46.11.0.0/16',
4549 'MU': '105.16.0.0/12',
4550 'MV': '27.114.128.0/18',
4551 'MW': '102.70.0.0/15',
4552 'MX': '187.192.0.0/11',
4553 'MY': '175.136.0.0/13',
4554 'MZ': '197.218.0.0/15',
4555 'NA': '41.182.0.0/16',
4556 'NC': '101.101.0.0/18',
4557 'NE': '197.214.0.0/18',
4558 'NF': '203.17.240.0/22',
4559 'NG': '105.112.0.0/12',
4560 'NI': '186.76.0.0/15',
4561 'NL': '145.96.0.0/11',
4562 'NO': '84.208.0.0/13',
4563 'NP': '36.252.0.0/15',
4564 'NR': '203.98.224.0/19',
4565 'NU': '49.156.48.0/22',
4566 'NZ': '49.224.0.0/14',
4567 'OM': '5.36.0.0/15',
4568 'PA': '186.72.0.0/15',
4569 'PE': '186.160.0.0/14',
4570 'PF': '123.50.64.0/18',
4571 'PG': '124.240.192.0/19',
4572 'PH': '49.144.0.0/13',
4573 'PK': '39.32.0.0/11',
4574 'PL': '83.0.0.0/11',
4575 'PM': '70.36.0.0/20',
4576 'PR': '66.50.0.0/16',
4577 'PS': '188.161.0.0/16',
4578 'PT': '85.240.0.0/13',
4579 'PW': '202.124.224.0/20',
4580 'PY': '181.120.0.0/14',
4581 'QA': '37.210.0.0/15',
4582 'RE': '102.35.0.0/16',
4583 'RO': '79.112.0.0/13',
4584 'RS': '93.86.0.0/15',
4585 'RU': '5.136.0.0/13',
4586 'RW': '41.186.0.0/16',
4587 'SA': '188.48.0.0/13',
4588 'SB': '202.1.160.0/19',
4589 'SC': '154.192.0.0/11',
4590 'SD': '102.120.0.0/13',
4591 'SE': '78.64.0.0/12',
4592 'SG': '8.128.0.0/10',
4593 'SI': '188.196.0.0/14',
4594 'SK': '78.98.0.0/15',
4595 'SL': '102.143.0.0/17',
4596 'SM': '89.186.32.0/19',
4597 'SN': '41.82.0.0/15',
4598 'SO': '154.115.192.0/18',
4599 'SR': '186.179.128.0/17',
4600 'SS': '105.235.208.0/21',
4601 'ST': '197.159.160.0/19',
4602 'SV': '168.243.0.0/16',
4603 'SX': '190.102.0.0/20',
4604 'SY': '5.0.0.0/16',
4605 'SZ': '41.84.224.0/19',
4606 'TC': '65.255.48.0/20',
4607 'TD': '154.68.128.0/19',
4608 'TG': '196.168.0.0/14',
4609 'TH': '171.96.0.0/13',
4610 'TJ': '85.9.128.0/18',
4611 'TK': '27.96.24.0/21',
4612 'TL': '180.189.160.0/20',
4613 'TM': '95.85.96.0/19',
4614 'TN': '197.0.0.0/11',
4615 'TO': '175.176.144.0/21',
4616 'TR': '78.160.0.0/11',
4617 'TT': '186.44.0.0/15',
4618 'TV': '202.2.96.0/19',
4619 'TW': '120.96.0.0/11',
4620 'TZ': '156.156.0.0/14',
4621 'UA': '37.52.0.0/14',
4622 'UG': '102.80.0.0/13',
4623 'US': '6.0.0.0/8',
4624 'UY': '167.56.0.0/13',
4625 'UZ': '84.54.64.0/18',
4626 'VA': '212.77.0.0/19',
4627 'VC': '207.191.240.0/21',
4628 'VE': '186.88.0.0/13',
4629 'VG': '66.81.192.0/20',
4630 'VI': '146.226.0.0/16',
4631 'VN': '14.160.0.0/11',
4632 'VU': '202.80.32.0/20',
4633 'WF': '117.20.32.0/21',
4634 'WS': '202.4.32.0/19',
4635 'YE': '134.35.0.0/16',
4636 'YT': '41.242.116.0/22',
4637 'ZA': '41.0.0.0/11',
4638 'ZM': '102.144.0.0/13',
4639 'ZW': '102.177.192.0/18',
4640 }
4641
4642 @classmethod
4643 def random_ipv4(cls, code_or_block):
4644 if len(code_or_block) == 2:
4645 block = cls._country_ip_map.get(code_or_block.upper())
4646 if not block:
4647 return None
4648 else:
4649 block = code_or_block
4650 addr, preflen = block.split('/')
4651 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4652 addr_max = addr_min | (0xffffffff >> int(preflen))
4653 return socket.inet_ntoa(  # inet_ntoa already returns str
4654 struct.pack('!L', random.randint(addr_min, addr_max)))
4655
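# Hedged example: the result is random within the block, so the addresses
# below are only possible outputs:
#   >>> GeoUtils.random_ipv4('US')            # drawn from 6.0.0.0/8
#   '6.123.45.67'
#   >>> GeoUtils.random_ipv4('192.0.2.0/24')  # explicit CIDR blocks also work
#   '192.0.2.42'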
4656
4657 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4658 def __init__(self, proxies=None):
4659 # Set default handlers
4660 for type in ('http', 'https'):
4661 setattr(self, '%s_open' % type,
4662 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4663 meth(r, proxy, type))
4664 urllib.request.ProxyHandler.__init__(self, proxies)
4665
4666 def proxy_open(self, req, proxy, type):
4667 req_proxy = req.headers.get('Ytdl-request-proxy')
4668 if req_proxy is not None:
4669 proxy = req_proxy
4670 del req.headers['Ytdl-request-proxy']
4671
4672 if proxy == '__noproxy__':
4673 return None # No Proxy
4674 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4675 req.add_header('Ytdl-socks-proxy', proxy)
4676 # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
4677 return None
4678 return urllib.request.ProxyHandler.proxy_open(
4679 self, req, proxy, type)
4680
4681
4682 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4683 # released into Public Domain
4684 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4685
4686 def long_to_bytes(n, blocksize=0):
4687 """long_to_bytes(n:long, blocksize:int) : string
4688 Convert a long integer to a byte string.
4689
4690 If optional blocksize is given and greater than zero, pad the front of the
4691 byte string with binary zeros so that the length is a multiple of
4692 blocksize.
4693 """
4694 # after much testing, this algorithm was deemed to be the fastest
4695 s = b''
4696 n = int(n)
4697 while n > 0:
4698 s = struct.pack('>I', n & 0xffffffff) + s
4699 n = n >> 32
4700 # strip off leading zeros
4701 for i in range(len(s)):
4702 if s[i] != b'\000'[0]:
4703 break
4704 else:
4705 # only happens when n == 0
4706 s = b'\000'
4707 i = 0
4708 s = s[i:]
4709 # add back some pad bytes. this could be done more efficiently w.r.t. the
4710 # de-padding being done above, but sigh...
4711 if blocksize > 0 and len(s) % blocksize:
4712 s = (blocksize - len(s) % blocksize) * b'\000' + s
4713 return s
4714
4715
4716 def bytes_to_long(s):
4717 """bytes_to_long(string) : long
4718 Convert a byte string to a long integer.
4719
4720 This is (essentially) the inverse of long_to_bytes().
4721 """
4722 acc = 0
4723 length = len(s)
4724 if length % 4:
4725 extra = (4 - length % 4)
4726 s = b'\000' * extra + s
4727 length = length + extra
4728 for i in range(0, length, 4):
4729 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4730 return acc
4731
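# Round-trip sketch (illustrative):
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> bytes_to_long(b'\x01\x00')
#   256
#   >>> long_to_bytes(1, blocksize=4)  # front-padded to a multiple of blocksize
#   b'\x00\x00\x00\x01'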
4732
4733 def ohdave_rsa_encrypt(data, exponent, modulus):
4734 '''
4735 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4736
4737 Input:
4738 data: data to encrypt, bytes-like object
4739 exponent, modulus: parameter e and N of RSA algorithm, both integer
4740 Output: hex string of encrypted data
4741
4742 Limitation: supports one block encryption only
4743 '''
4744
4745 payload = int(binascii.hexlify(data[::-1]), 16)
4746 encrypted = pow(payload, exponent, modulus)
4747 return '%x' % encrypted
4748
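# Toy numbers only (real keys use a large modulus):
#   >>> ohdave_rsa_encrypt(b'\x02', 3, 101)  # pow(2, 3, 101) == 8
#   '8'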
4749
4750 def pkcs1pad(data, length):
4751 """
4752 Padding input data with PKCS#1 scheme
4753
4754 @param {int[]} data input data
4755 @param {int} length target length
4756 @returns {int[]} padded data
4757 """
4758 if len(data) > length - 11:
4759 raise ValueError('Input data too long for PKCS#1 padding')
4760
4761 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 requires non-zero padding octets
4762 return [0, 2] + pseudo_random + [0] + data
4763
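# Shape sketch (the ten middle bytes are random non-zero padding):
#   pkcs1pad([1, 2, 3], 16)  ->  [0, 2, r1, ..., r10, 0, 1, 2, 3]   (16 ints)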
4764
4765 def _base_n_table(n, table):
4766 if not table and not n:
4767 raise ValueError('Either table or n must be specified')
4768 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4769
4770 if n and n != len(table):
4771 raise ValueError(f'base {n} exceeds table length {len(table)}')
4772 return table
4773
4774
4775 def encode_base_n(num, n=None, table=None):
4776 """Convert given int to a base-n string"""
4777 table = _base_n_table(n, table)
4778 if not num:
4779 return table[0]
4780
4781 result, base = '', len(table)
4782 while num:
4783 result = table[num % base] + result
4784 num = num // base
4785 return result
4786
4787
4788 def decode_base_n(string, n=None, table=None):
4789 """Convert given base-n string to int"""
4790 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4791 result, base = 0, len(table)
4792 for char in string:
4793 result = result * base + table[char]
4794 return result
4795
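# e.g. (illustrative):
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(255, table='01')  # a custom table implies the base
#   '11111111'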
4796
4797 def decode_base(value, digits):
4798 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4799 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4800 return decode_base_n(value, table=digits)
4801
4802
4803 def decode_packed_codes(code):
4804 mobj = re.search(PACKED_CODES_RE, code)
4805 obfuscated_code, base, count, symbols = mobj.groups()
4806 base = int(base)
4807 count = int(count)
4808 symbols = symbols.split('|')
4809 symbol_table = {}
4810
4811 while count:
4812 count -= 1
4813 base_n_count = encode_base_n(count, base)
4814 symbol_table[base_n_count] = symbols[count] or base_n_count
4815
4816 return re.sub(
4817 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4818 obfuscated_code)
4819
4820
4821 def caesar(s, alphabet, shift):
4822 if shift == 0:
4823 return s
4824 length = len(alphabet)
4825 return ''.join(
4826 alphabet[(alphabet.index(c) + shift) % length] if c in alphabet else c
4827 for c in s)
4828
4829
4830 def rot47(s):
4831 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4832
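# rot47 is its own inverse over the printable ASCII range; e.g. (illustrative):
#   >>> rot47('foo')
#   '7@@'
#   >>> rot47('7@@')
#   'foo'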
4833
4834 def parse_m3u8_attributes(attrib):
4835 info = {}
4836 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4837 if val.startswith('"'):
4838 val = val[1:-1]
4839 info[key] = val
4840 return info
4841
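# e.g. (illustrative attribute list):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401e"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401e'}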
4842
4843 def urshift(val, n):
4844 return val >> n if val >= 0 else (val + 0x100000000) >> n
4845
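# Emulates JavaScript's unsigned right shift (>>>) for 32-bit values:
#   >>> urshift(-1, 28)
#   15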
4846
4847 # Based on png2str() written by @gdkchan and improved by @yokrysty
4848 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4849 def decode_png(png_data):
4850 # Reference: https://www.w3.org/TR/PNG/
4851 header = png_data[8:]
4852
4853 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4854 raise OSError('Not a valid PNG file.')
4855
4856 int_map = {1: '>B', 2: '>H', 4: '>I'}
4857 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4858
4859 chunks = []
4860
4861 while header:
4862 length = unpack_integer(header[:4])
4863 header = header[4:]
4864
4865 chunk_type = header[:4]
4866 header = header[4:]
4867
4868 chunk_data = header[:length]
4869 header = header[length:]
4870
4871 header = header[4:] # Skip CRC
4872
4873 chunks.append({
4874 'type': chunk_type,
4875 'length': length,
4876 'data': chunk_data
4877 })
4878
4879 ihdr = chunks[0]['data']
4880
4881 width = unpack_integer(ihdr[:4])
4882 height = unpack_integer(ihdr[4:8])
4883
4884 idat = b''
4885
4886 for chunk in chunks:
4887 if chunk['type'] == b'IDAT':
4888 idat += chunk['data']
4889
4890 if not idat:
4891 raise OSError('Unable to read PNG data.')
4892
4893 decompressed_data = bytearray(zlib.decompress(idat))
4894
4895 stride = width * 3
4896 pixels = []
4897
4898 def _get_pixel(idx):
4899 x = idx % stride
4900 y = idx // stride
4901 return pixels[y][x]
4902
4903 for y in range(height):
4904 basePos = y * (1 + stride)
4905 filter_type = decompressed_data[basePos]
4906
4907 current_row = []
4908
4909 pixels.append(current_row)
4910
4911 for x in range(stride):
4912 color = decompressed_data[1 + basePos + x]
4913 basex = y * stride + x
4914 left = 0
4915 up = 0
4916
4917 if x > 2:
4918 left = _get_pixel(basex - 3)
4919 if y > 0:
4920 up = _get_pixel(basex - stride)
4921
4922 if filter_type == 1: # Sub
4923 color = (color + left) & 0xff
4924 elif filter_type == 2: # Up
4925 color = (color + up) & 0xff
4926 elif filter_type == 3: # Average
4927 color = (color + ((left + up) >> 1)) & 0xff
4928 elif filter_type == 4: # Paeth
4929 a = left
4930 b = up
4931 c = 0
4932
4933 if x > 2 and y > 0:
4934 c = _get_pixel(basex - stride - 3)
4935
4936 p = a + b - c
4937
4938 pa = abs(p - a)
4939 pb = abs(p - b)
4940 pc = abs(p - c)
4941
4942 if pa <= pb and pa <= pc:
4943 color = (color + a) & 0xff
4944 elif pb <= pc:
4945 color = (color + b) & 0xff
4946 else:
4947 color = (color + c) & 0xff
4948
4949 current_row.append(color)
4950
4951 return width, height, pixels
4952
4953
4954 def write_xattr(path, key, value):
4955 # Windows: Write xattrs to NTFS Alternate Data Streams:
4956 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4957 if compat_os_name == 'nt':
4958 assert ':' not in key
4959 assert os.path.exists(path)
4960
4961 try:
4962 with open(f'{path}:{key}', 'wb') as f:
4963 f.write(value)
4964 except OSError as e:
4965 raise XAttrMetadataError(e.errno, e.strerror)
4966 return
4967
4968 # UNIX Method 1. Use xattrs/pyxattrs modules
4969
4970 setxattr = None
4971 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4972 # Unicode arguments are not supported in pyxattr until version 0.5.0
4973 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4974 if version_tuple(xattr.__version__) >= (0, 5, 0):
4975 setxattr = xattr.set
4976 elif xattr:
4977 setxattr = xattr.setxattr
4978
4979 if setxattr:
4980 try:
4981 setxattr(path, key, value)
4982 except OSError as e:
4983 raise XAttrMetadataError(e.errno, e.strerror)
4984 return
4985
4986 # UNIX Method 2. Use setfattr/xattr executables
4987 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4988 else 'xattr' if check_executable('xattr', ['-h']) else None)
4989 if not exe:
4990 raise XAttrUnavailableError(
4991 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4992 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4993
4994 value = value.decode()
4995 try:
4996 _, stderr, returncode = Popen.run(
4997 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4998 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4999 except OSError as e:
5000 raise XAttrMetadataError(e.errno, e.strerror)
5001 if returncode:
5002 raise XAttrMetadataError(returncode, stderr)
5003
5004
5005 def random_birthday(year_field, month_field, day_field):
5006 start_date = datetime.date(1950, 1, 1)
5007 end_date = datetime.date(1995, 12, 31)
5008 offset = random.randint(0, (end_date - start_date).days)
5009 random_date = start_date + datetime.timedelta(offset)
5010 return {
5011 year_field: str(random_date.year),
5012 month_field: str(random_date.month),
5013 day_field: str(random_date.day),
5014 }
5015
5016
5017 # Templates for internet shortcut files, which are plain text files.
5018 DOT_URL_LINK_TEMPLATE = '''\
5019 [InternetShortcut]
5020 URL=%(url)s
5021 '''
5022
5023 DOT_WEBLOC_LINK_TEMPLATE = '''\
5024 <?xml version="1.0" encoding="UTF-8"?>
5025 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5026 <plist version="1.0">
5027 <dict>
5028 \t<key>URL</key>
5029 \t<string>%(url)s</string>
5030 </dict>
5031 </plist>
5032 '''
5033
5034 DOT_DESKTOP_LINK_TEMPLATE = '''\
5035 [Desktop Entry]
5036 Encoding=UTF-8
5037 Name=%(filename)s
5038 Type=Link
5039 URL=%(url)s
5040 Icon=text-html
5041 '''
5042
5043 LINK_TEMPLATES = {
5044 'url': DOT_URL_LINK_TEMPLATE,
5045 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5046 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5047 }
5048
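# e.g. DOT_URL_LINK_TEMPLATE % {'url': 'https://example.com/'} renders as:
#   [InternetShortcut]
#   URL=https://example.com/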
5049
5050 def iri_to_uri(iri):
5051 """
5052 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5053
5054 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5055 """
5056
5057 iri_parts = urllib.parse.urlparse(iri)
5058
5059 if '[' in iri_parts.netloc:
5060 raise ValueError('IPv6 URIs are not yet supported.')
5061 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5062
5063 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5064
5065 net_location = ''
5066 if iri_parts.username:
5067 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5068 if iri_parts.password is not None:
5069 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5070 net_location += '@'
5071
5072 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5073 # The 'idna' encoding produces ASCII text.
5074 if iri_parts.port is not None and iri_parts.port != 80:
5075 net_location += ':' + str(iri_parts.port)
5076
5077 return urllib.parse.urlunparse(
5078 (iri_parts.scheme,
5079 net_location,
5080
5081 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5082
5083 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5084 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5085
5086 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5087 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5088
5089 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5090
5091 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5092
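# A hedged sketch (host and path are made up):
#   >>> iri_to_uri('http://müller.example/ä?q=ö')
#   'http://xn--mller-kva.example/%C3%A4?q=%C3%B6'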
5093
5094 def to_high_limit_path(path):
5095 if sys.platform in ['win32', 'cygwin']:
5096 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5097 return '\\\\?\\' + os.path.abspath(path)
5098
5099 return path
5100
5101
5102 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5103 val = traverse_obj(obj, *variadic(field))
5104 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5105 return default
5106 return template % func(val)
5107
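# Hedged examples (field names made up):
#   >>> format_field({'width': 1280}, 'width', '%dpx')
#   '1280px'
#   >>> format_field({}, 'width', '%dpx', default='unknown')
#   'unknown'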
5108
5109 def clean_podcast_url(url):
5110 return re.sub(r'''(?x)
5111 (?:
5112 (?:
5113 chtbl\.com/track|
5114 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5115 play\.podtrac\.com
5116 )/[^/]+|
5117 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5118 flex\.acast\.com|
5119 pd(?:
5120 cn\.co| # https://podcorn.com/analytics-prefix/
5121 st\.fm # https://podsights.com/docs/
5122 )/e
5123 )/''', '', url)
5124
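# e.g. (illustrative URL):
#   >>> clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.example.com/e.mp3')
#   'https://traffic.example.com/e.mp3'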
5125
5126 _HEX_TABLE = '0123456789abcdef'
5127
5128
5129 def random_uuidv4():
5130 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5131
5132
5133 def make_dir(path, to_screen=None):
5134 try:
5135 dn = os.path.dirname(path)
5136 if dn and not os.path.exists(dn):
5137 os.makedirs(dn)
5138 return True
5139 except OSError as err:
5140 if callable(to_screen):
5141 to_screen('unable to create directory ' + error_to_compat_str(err))
5142 return False
5143
5144
5145 def get_executable_path():
5146 from .update import _get_variant_and_executable_path
5147
5148 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5149
5150
5151 def load_plugins(name, suffix, namespace):
5152 classes = {}
5153 with contextlib.suppress(FileNotFoundError):
5154 plugins_spec = importlib.util.spec_from_file_location(
5155 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5156 plugins = importlib.util.module_from_spec(plugins_spec)
5157 sys.modules[plugins_spec.name] = plugins
5158 plugins_spec.loader.exec_module(plugins)
5159 for name in dir(plugins):
5160 if name in namespace:
5161 continue
5162 if not name.endswith(suffix):
5163 continue
5164 klass = getattr(plugins, name)
5165 classes[name] = namespace[name] = klass
5166 return classes
5167
5168
5169 def traverse_obj(
5170 obj, *path_list, default=None, expected_type=None, get_all=True,
5171 casesense=True, is_user_input=False, traverse_string=False):
5172 ''' Traverse nested list/dict/tuple
5173 @param path_list A list of paths which are checked one by one.
5174 Each path is a list of keys where each key is a:
5175 - None: Do nothing
5176 - string: A dictionary key
5177 - int: An index into a list
5178 - tuple: A list of keys all of which will be traversed
5179 - Ellipsis: Fetch all values in the object
5180 - Function: Takes the key and value as arguments
5181 and returns whether the key matches or not
5182 @param default Default value to return
5183 @param expected_type Only accept final value of this type (Can also be any callable)
5184 @param get_all Return all the values obtained from a path or only the first one
5185 @param casesense Whether to consider dictionary keys as case sensitive
5186 @param is_user_input Whether the keys are generated from user input. If True,
5187 strings are converted to int/slice if necessary
5188 @param traverse_string Whether to traverse inside strings. If True, any
5189 non-compatible object will also be converted into a string
5190 # TODO: Write tests
5191 '''
5192 if not casesense:
5193 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5194 path_list = (map(_lower, variadic(path)) for path in path_list)
5195
5196 def _traverse_obj(obj, path, _current_depth=0):
5197 nonlocal depth
5198 path = tuple(variadic(path))
5199 for i, key in enumerate(path):
5200 if None in (key, obj):
5201 return obj
5202 if isinstance(key, (list, tuple)):
5203 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5204 key = ...
5205 if key is ...:
5206 obj = (obj.values() if isinstance(obj, dict)
5207 else obj if isinstance(obj, (list, tuple, LazyList))
5208 else str(obj) if traverse_string else [])
5209 _current_depth += 1
5210 depth = max(depth, _current_depth)
5211 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5212 elif callable(key):
5213 if isinstance(obj, (list, tuple, LazyList)):
5214 obj = enumerate(obj)
5215 elif isinstance(obj, dict):
5216 obj = obj.items()
5217 else:
5218 if not traverse_string:
5219 return None
5220 obj = str(obj)
5221 _current_depth += 1
5222 depth = max(depth, _current_depth)
5223 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5224 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5225 obj = (obj.get(key) if casesense or (key in obj)
5226 else next((v for k, v in obj.items() if _lower(k) == key), None))
5227 else:
5228 if is_user_input:
5229 key = (int_or_none(key) if ':' not in key
5230 else slice(*map(int_or_none, key.split(':'))))
5231 if key == slice(None):
5232 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5233 if not isinstance(key, (int, slice)):
5234 return None
5235 if not isinstance(obj, (list, tuple, LazyList)):
5236 if not traverse_string:
5237 return None
5238 obj = str(obj)
5239 try:
5240 obj = obj[key]
5241 except IndexError:
5242 return None
5243 return obj
5244
5245 if isinstance(expected_type, type):
5246 type_test = lambda val: val if isinstance(val, expected_type) else None
5247 else:
5248 type_test = expected_type or IDENTITY
5249
5250 for path in path_list:
5251 depth = 0
5252 val = _traverse_obj(obj, path)
5253 if val is not None:
5254 if depth:
5255 for _ in range(depth - 1):
5256 val = itertools.chain.from_iterable(v for v in val if v is not None)
5257 val = [v for v in map(type_test, val) if v is not None]
5258 if val:
5259 return val if get_all else val[0]
5260 else:
5261 val = type_test(val)
5262 if val is not None:
5263 return val
5264 return default
5265
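# Hedged examples of the path syntax (the data is made up):
#   >>> traverse_obj({'a': {'b': [1, 2]}}, ('a', 'b', 0))
#   1
#   >>> traverse_obj({'a': [{'id': 1}, {'id': 2}]}, ('a', ..., 'id'))
#   [1, 2]
#   >>> traverse_obj({'A': 1}, 'a', casesense=False)
#   1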
5266
5267 def traverse_dict(dictn, keys, casesense=True):
5268 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5269 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5270 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5271
5272
5273 def get_first(obj, keys, **kwargs):
5274 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5275
5276
5277 def variadic(x, allowed_types=(str, bytes, dict)):
5278 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5279
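# e.g. (illustrative):
#   >>> variadic(1)
#   (1,)
#   >>> variadic('str')  # str/bytes/dict count as single values, not iterables
#   ('str',)
#   >>> variadic([1, 2])
#   [1, 2]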
5280
5281 def time_seconds(**kwargs):
5282 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5283 return t.timestamp()
5284
5285
5286 # create a JSON Web Signature (jws) with HS256 algorithm
5287 # the resulting format is in JWS Compact Serialization
5288 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5289 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5290 def jwt_encode_hs256(payload_data, key, headers={}):
5291 header_data = {
5292 'alg': 'HS256',
5293 'typ': 'JWT',
5294 }
5295 if headers:
5296 header_data.update(headers)
5297 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5298 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5299 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5300 signature_b64 = base64.b64encode(h.digest())
5301 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5302 return token
5303
5304
5305 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5306 def jwt_decode_hs256(jwt):
5307 header_b64, payload_b64, signature_b64 = jwt.split('.')
5308 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))  # re-add padding stripped from JWTs; superfluous ='s are ignored
5309 return payload_data
5310
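# Hedged round-trip (key and payload made up):
#   >>> token = jwt_encode_hs256({'uid': 1}, 'secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 1}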
5311
5312 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5313
5314
5315 @functools.cache
5316 def supports_terminal_sequences(stream):
5317 if compat_os_name == 'nt':
5318 if not WINDOWS_VT_MODE:
5319 return False
5320 elif not os.getenv('TERM'):
5321 return False
5322 try:
5323 return stream.isatty()
5324 except BaseException:
5325 return False
5326
5327
5328 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5329 if get_windows_version() < (10, 0, 10586):
5330 return
5331 global WINDOWS_VT_MODE
5332 try:
5333 Popen.run('', shell=True)
5334 except Exception:
5335 return
5336
5337 WINDOWS_VT_MODE = True
5338 supports_terminal_sequences.cache_clear()
5339
5340
5341 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5342
5343
5344 def remove_terminal_sequences(string):
5345 return _terminal_sequences_re.sub('', string)
5346
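# e.g. (illustrative):
#   >>> remove_terminal_sequences('\033[32mOK\033[0m')
#   'OK'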
5347
5348 def number_of_digits(number):
5349 return len('%d' % number)
5350
5351
5352 def join_nonempty(*values, delim='-', from_dict=None):
5353 if from_dict is not None:
5354 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5355 return delim.join(map(str, filter(None, values)))
5356
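# Falsy values are dropped before joining; e.g. (illustrative):
#   >>> join_nonempty('mp4', None, '', 1080, delim='-')
#   'mp4-1080'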
5357
5358 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5359 """
5360 Find the largest format dimensions in terms of video width and, for each thumbnail:
5361 * Modify the URL: Match the width with the provided regex and replace with the former width
5362 * Update dimensions
5363
5364 This function is useful with video services that scale the provided thumbnails on demand
5365 """
5366 _keys = ('width', 'height')
5367 max_dimensions = max(
5368 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5369 default=(0, 0))
5370 if not max_dimensions[0]:
5371 return thumbnails
5372 return [
5373 merge_dicts(
5374 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5375 dict(zip(_keys, max_dimensions)), thumbnail)
5376 for thumbnail in thumbnails
5377 ]
5378
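# A hedged sketch (the regex and URLs are made up for illustration):
#   formats = [{'width': 1920, 'height': 1080}]
#   thumbs = [{'url': 'https://example.com/img/640/t.jpg', 'width': 640, 'height': 360}]
#   scale_thumbnails_to_max_format_width(formats, thumbs, r'(?<=/)\d+(?=/)')
#   ->  [{'url': 'https://example.com/img/1920/t.jpg', 'width': 1920, 'height': 1080}]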
5379
5380 def parse_http_range(range):
5381 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5382 if not range:
5383 return None, None, None
5384 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5385 if not crg:
5386 return None, None, None
5387 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5388
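# e.g. (illustrative headers):
#   >>> parse_http_range('bytes=0-499')
#   (0, 499, None)
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)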
5389
5390 def read_stdin(what):
5391 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5392 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5393 return sys.stdin
5394
5395
5396 def determine_file_encoding(data):
5397 """
5398 Detect the text encoding used
5399 @returns (encoding, bytes to skip)
5400 """
5401
5402 # BOMs are given priority over coding declarations
5403 for bom, enc in BOMS:
5404 if data.startswith(bom):
5405 return enc, len(bom)
5406
5407 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5408 # We ignore the endianness to get a good enough match
5409 data = data.replace(b'\0', b'')
5410 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5411 return mobj.group(1).decode() if mobj else None, 0
5412
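# e.g. (illustrative):
#   >>> determine_file_encoding(b'\xef\xbb\xbf--proxy ...')
#   ('utf-8', 3)
#   >>> determine_file_encoding(b'# coding: cp932\n--proxy ...')
#   ('cp932', 0)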
5413
5414 class Config:
5415 own_args = None
5416 parsed_args = None
5417 filename = None
5418 __initialized = False
5419
5420 def __init__(self, parser, label=None):
5421 self.parser, self.label = parser, label
5422 self._loaded_paths, self.configs = set(), []
5423
5424 def init(self, args=None, filename=None):
5425 assert not self.__initialized
5426 self.own_args, self.filename = args, filename
5427 return self.load_configs()
5428
5429 def load_configs(self):
5430 directory = ''
5431 if self.filename:
5432 location = os.path.realpath(self.filename)
5433 directory = os.path.dirname(location)
5434 if location in self._loaded_paths:
5435 return False
5436 self._loaded_paths.add(location)
5437
5438 self.__initialized = True
5439 opts, _ = self.parser.parse_known_args(self.own_args)
5440 self.parsed_args = self.own_args
5441 for location in opts.config_locations or []:
5442 if location == '-':
5443 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5444 continue
5445 location = os.path.join(directory, expand_path(location))
5446 if os.path.isdir(location):
5447 location = os.path.join(location, 'yt-dlp.conf')
5448 if not os.path.exists(location):
5449 self.parser.error(f'config location {location} does not exist')
5450 self.append_config(self.read_file(location), location)
5451 return True
5452
5453 def __str__(self):
5454 label = join_nonempty(
5455 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5456 delim=' ')
5457 return join_nonempty(
5458 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5459 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5460 delim='\n')
5461
5462 @staticmethod
5463 def read_file(filename, default=[]):
5464 try:
5465 optionf = open(filename, 'rb')
5466 except OSError:
5467 return default # silently skip if file is not present
5468 try:
5469 enc, skip = determine_file_encoding(optionf.read(512))
5470 optionf.seek(skip, io.SEEK_SET)
5471 except OSError:
5472 enc = None # silently skip read errors
5473 try:
5474 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5475 contents = optionf.read().decode(enc or preferredencoding())
5476 res = shlex.split(contents, comments=True)
5477 except Exception as err:
5478 raise ValueError(f'Unable to parse "{filename}": {err}')
5479 finally:
5480 optionf.close()
5481 return res
5482
5483 @staticmethod
5484 def hide_login_info(opts):
5485 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5486 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5487
5488 def _scrub_eq(o):
5489 m = eqre.match(o)
5490 if m:
5491 return m.group('key') + '=PRIVATE'
5492 else:
5493 return o
5494
5495 opts = list(map(_scrub_eq, opts))
5496 for idx, opt in enumerate(opts):
5497 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5498 opts[idx + 1] = 'PRIVATE'
5499 return opts
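
# e.g. Config.hide_login_info(['-u', 'foo', '--password=bar', '-v'])
#      == ['-u', 'PRIVATE', '--password=PRIVATE', '-v']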
5500
5501 def append_config(self, *args, label=None):
5502 config = type(self)(self.parser, label)
5503 config._loaded_paths = self._loaded_paths
5504 if config.init(*args):
5505 self.configs.append(config)
5506
5507 @property
5508 def all_args(self):
5509 for config in reversed(self.configs):
5510 yield from config.all_args
5511 yield from self.parsed_args or []
5512
5513 def parse_known_args(self, **kwargs):
5514 return self.parser.parse_known_args(self.all_args, **kwargs)
5515
5516 def parse_args(self):
5517 return self.parser.parse_args(self.all_args)
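
# A minimal usage sketch (hypothetical argparse-based parser; the real caller
# passes yt-dlp's own option parser, which likewise provides parse_known_args()
# and a `config_locations` option for nested configs):
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--config-locations', action='append')
#     config = Config(parser, label='Example')
#     config.init(['-f', 'best'])  # unknown args pass through parse_known_args
#     list(config.all_args)  # ['-f', 'best']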
5518
5519
5520 class WebSocketsWrapper:
5521 """Wraps the websockets module for use in non-async scopes"""
5522 pool = None
5523
5524 def __init__(self, url, headers=None, connect=True):
5525 self.loop = asyncio.new_event_loop()
5526 # XXX: "loop" is deprecated
5527 self.conn = websockets.connect(
5528 url, extra_headers=headers, ping_interval=None,
5529 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5530 if connect:
5531 self.__enter__()
5532 atexit.register(self.__exit__, None, None, None)
5533
5534 def __enter__(self):
5535 if not self.pool:
5536 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5537 return self
5538
5539 def send(self, *args):
5540 self.run_with_loop(self.pool.send(*args), self.loop)
5541
5542 def recv(self, *args):
5543 return self.run_with_loop(self.pool.recv(*args), self.loop)
5544
5545 def __exit__(self, type, value, traceback):
5546 try:
5547 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5548 finally:
5549 self._cancel_all_tasks(self.loop)  # cancel leftover tasks before the loop is closed
5550 self.loop.close()
5551
5552 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5553 # For contributors: if any new library that uses asyncio needs to run in a non-async scope, move these functions out of this class
5554 @staticmethod
5555 def run_with_loop(main, loop):
5556 if not asyncio.iscoroutine(main):
5557 raise ValueError(f'a coroutine was expected, got {main!r}')
5558
5559 try:
5560 return loop.run_until_complete(main)
5561 finally:
5562 loop.run_until_complete(loop.shutdown_asyncgens())
5563 if hasattr(loop, 'shutdown_default_executor'):
5564 loop.run_until_complete(loop.shutdown_default_executor())
5565
5566 @staticmethod
5567 def _cancel_all_tasks(loop):
5568 to_cancel = asyncio.all_tasks(loop)
5569
5570 if not to_cancel:
5571 return
5572
5573 for task in to_cancel:
5574 task.cancel()
5575
5576 # XXX: "loop" is removed in python 3.10+
5577 loop.run_until_complete(
5578 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5579
5580 for task in to_cancel:
5581 if task.cancelled():
5582 continue
5583 if task.exception() is not None:
5584 loop.call_exception_handler({
5585 'message': 'unhandled exception during asyncio.run() shutdown',
5586 'exception': task.exception(),
5587 'task': task,
5588 })
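
# A minimal usage sketch (hypothetical URL; requires the optional `websockets`
# dependency):
#
#     ws = WebSocketsWrapper('wss://ws.example.com/socket', headers={'Origin': 'https://example.com'})
#     ws.send('{"op": "subscribe"}')
#     reply = ws.recv()
#     ws.__exit__(None, None, None)  # explicit cleanup; an atexit hook is also registered in __init__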
5589
5590
5591 def merge_headers(*dicts):
5592 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5593 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
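
# e.g. merge_headers({'accept': '*/*'}, {'ACCEPT': 'text/html'}) == {'Accept': 'text/html'}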
5594
5595
5596 def cached_method(f):
5597 """Cache a method"""
5598 signature = inspect.signature(f)
5599
5600 @functools.wraps(f)
5601 def wrapper(self, *args, **kwargs):
5602 bound_args = signature.bind(self, *args, **kwargs)
5603 bound_args.apply_defaults()
5604 key = tuple(bound_args.arguments.values())
5605
5606 if not hasattr(self, '__cached_method__cache'):
5607 self.__cached_method__cache = {}
5608 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5609 if key not in cache:
5610 cache[key] = f(self, *args, **kwargs)
5611 return cache[key]
5612 return wrapper
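
# A minimal sketch: results are cached on the instance, keyed by the full
# bound-argument tuple, so each (instance, arguments) pair computes only once:
#
#     class Page:
#         @cached_method
#         def fetch(self, url):
#             print('fetching', url)  # side effect runs once per (self, url)
#             return url.upper()
#
#     p = Page()
#     p.fetch('a'); p.fetch('a')  # prints "fetching a" only once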
5613
5614
5615 class classproperty:
5616 """property access for class methods"""
5617
5618 def __init__(self, func):
5619 functools.update_wrapper(self, func)
5620 self.func = func
5621
5622 def __get__(self, _, cls):
5623 return self.func(cls)
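
# e.g.:
#     class Extractor:
#         @classproperty
#         def name(cls):
#             return cls.__name__.lower()
#
#     Extractor.name == 'extractor'  # resolved on the class, no instance needed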
5624
5625
5626 class Namespace(types.SimpleNamespace):
5627 """Immutable namespace"""
5628
5629 def __iter__(self):
5630 return iter(self.__dict__.values())
5631
5632 @property
5633 def items_(self):
5634 return self.__dict__.items()
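
# e.g. with ns = Namespace(a=1, b=2), iteration yields the *values*
# (list(ns) == [1, 2]) while ns.items_ gives the (name, value) pairs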
5635
5636
5637 MEDIA_EXTENSIONS = Namespace(
5638 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5639 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5640 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5641 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5642 thumbnails=('jpg', 'png', 'webp'),
5643 storyboards=('mhtml', ),
5644 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5645 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5646 )
5647 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5648 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5649
5650 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
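
# e.g. after the merges above, 'mp4' is in MEDIA_EXTENSIONS.video and 'm4a' is in
# MEDIA_EXTENSIONS.audio; KNOWN_EXTENSIONS additionally includes manifest types like 'm3u8'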
5651
5652
5653 # Deprecated
5654 has_certifi = bool(certifi)
5655 has_websockets = bool(websockets)