#!/usr/bin/env python3
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.parse
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_chr,
    compat_cookiejar,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_http_client,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


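# Usage sketch (illustrative, doctest-style; the 'media' prefix/URI pair below
# is a hypothetical example, not from the original source):
#
#   >>> xpath_with_ns('media:thumbnail', {'media': 'http://search.yahoo.com/mrss/'})
#   '{http://search.yahoo.com/mrss/}thumbnail'

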
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kwargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


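# Usage sketch (illustrative, doctest-style; the markup is a hypothetical example):
#
#   >>> get_elements_by_class('foo', '<div class="foo bar">text</div>')
#   ['text']
#   >>> get_elements_html_by_class('bar', '<div class="foo bar">text</div>')
#   ['<div class="foo bar">text</div>']

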
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


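# Usage sketch (illustrative, doctest-style; the snippet is a hypothetical example):
#
#   >>> clean_html('<p>foo&amp;bar</p><p>baz<br/>qux</p>')
#   'foo&bar\nbaz\nqux'

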
class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)


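# Usage sketch (illustrative, doctest-style): tolerate trailing garbage after a
# JSON document by parsing only the leading value.
#
#   >>> json.loads('{"status": "ok"} trailing garbage',
#   ...            cls=LenientJSONDecoder, ignore_extra=True)
#   {'status': 'ok'}

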
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


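# Usage sketch (illustrative, doctest-style; the date string is a hypothetical example):
#
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:10 +0000')
#   10
#   >>> timeconvert('not a date') is None
#   True

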
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result


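# Usage sketch (illustrative, doctest-style; input strings are hypothetical examples):
#
#   >>> sanitize_filename('AT&T', restricted=True)
#   'AT_T'
#   >>> sanitize_filename('ä', restricted=True)
#   'a'

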
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


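# Usage sketch (illustrative, doctest-style; URLs are hypothetical examples):
#
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'
#   >>> sanitize_url('rmtpe://media.example.com')
#   'rtmpe://media.example.com'

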
def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'


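# Usage sketch (illustrative, doctest-style; the URL is a hypothetical example):
#
#   >>> extract_basic_auth('http://user:pass@example.com/')
#   ('http://example.com/', 'Basic dXNlcjpwYXNz')

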
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())


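# Usage sketch (illustrative, doctest-style):
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]
#   >>> list(orderedSet('abca', lazy=True))
#   ['a', 'b', 'c']

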
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


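# Usage sketch (illustrative, doctest-style):
#
#   >>> unescapeHTML('&amp;'), unescapeHTML('&#39;'), unescapeHTML('&#x2F;')
#   ('&', "'", '/')

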
def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


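# Usage sketch (illustrative, doctest-style; the command is a hypothetical example):
#
#   >>> out, err, code = Popen.run([sys.executable, '-c', 'print("hi")'],
#   ...                            text=True, stdout=subprocess.PIPE)
#   >>> (out, err, code)
#   ('hi\n', '', 0)

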
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret


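# Usage sketch (illustrative, doctest-style):
#
#   >>> timetuple_from_msec(12345678)
#   Time(hours=3, minutes=25, seconds=45, milliseconds=678)
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61, msec=True)
#   '1:01.000'

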
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is a workaround for _create_connection() in the socket module,
        # which tries all address data from getaddrinfo(), including IPv6.
        # Here the getaddrinfo() result is filtered based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
        # To work around the aforementioned issue, we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail early with a clear error instead of a NameError below
        raise ValueError(f'Unsupported SOCKS proxy scheme: {url_components.scheme}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by the `expires` field set to either
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]), so we need to force the latter to be recognized as
        # session cookies on our own.
        # Session cookies may be important for cookies-based authentication:
        # usually, when a user does not check the 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies, and failing to recognize them will result in a failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1663
1664 # A 303 must either use GET or HEAD for subsequent request
1665 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1666 if code == 303 and m != 'HEAD':
1667 m = 'GET'
1668 # 301 and 302 redirects are commonly turned into a GET from a POST
1669 # for subsequent requests by browsers, so we'll do the same.
1670 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1671 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1672 if code in (301, 302) and m == 'POST':
1673 m = 'GET'
1674
1675 return compat_urllib_request.Request(
1676 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1677 unverifiable=True, method=m)
1678
1679
1680 def extract_timezone(date_str):
1681 m = re.search(
1682 r'''(?x)
1683 ^.{8,}? # >=8 char non-TZ prefix, if present
1684 (?P<tz>Z| # just the UTC Z, or
1685 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1686 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1687 [ ]? # optional space
1688 (?P<sign>\+|-) # +/-
1689 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1690 $)
1691 ''', date_str)
1692 if not m:
1693 timezone = datetime.timedelta()
1694 else:
1695 date_str = date_str[:-len(m.group('tz'))]
1696 if not m.group('sign'):
1697 timezone = datetime.timedelta()
1698 else:
1699 sign = 1 if m.group('sign') == '+' else -1
1700 timezone = datetime.timedelta(
1701 hours=sign * int(m.group('hours')),
1702 minutes=sign * int(m.group('minutes')))
1703 return timezone, date_str
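
# Illustrative behaviour (informal examples; values assume standard CPython
# datetime semantics):
#   >>> extract_timezone('2021-01-01T00:00:00+05:30')
#   (datetime.timedelta(seconds=19800), '2021-01-01T00:00:00')
#   >>> extract_timezone('2021-01-01 12:00:00')  # no timezone suffix
#   (datetime.timedelta(0), '2021-01-01 12:00:00')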
1704
1705
1706 def parse_iso8601(date_str, delimiter='T', timezone=None):
1707 """ Return a UNIX timestamp from the given date """
1708
1709 if date_str is None:
1710 return None
1711
1712 date_str = re.sub(r'\.[0-9]+', '', date_str)
1713
1714 if timezone is None:
1715 timezone, date_str = extract_timezone(date_str)
1716
1717 with contextlib.suppress(ValueError):
1718 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1719 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1720 return calendar.timegm(dt.timetuple())
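
# Illustrative usage (informal examples; both timestamps denote the same instant):
#   >>> parse_iso8601('2021-01-01T00:00:00Z')
#   1609459200
#   >>> parse_iso8601('1970-01-01T01:00:00+01:00')  # offset cancels out to the epoch
#   0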
1721
1722
1723 def date_formats(day_first=True):
1724 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1725
1726
1727 def unified_strdate(date_str, day_first=True):
1728 """Return a string with the date in the format YYYYMMDD"""
1729
1730 if date_str is None:
1731 return None
1732 upload_date = None
1733 # Replace commas
1734 date_str = date_str.replace(',', ' ')
1735 # Remove AM/PM + timezone
1736 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1737 _, date_str = extract_timezone(date_str)
1738
1739 for expression in date_formats(day_first):
1740 with contextlib.suppress(ValueError):
1741 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1742 if upload_date is None:
1743 timetuple = email.utils.parsedate_tz(date_str)
1744 if timetuple:
1745 with contextlib.suppress(ValueError):
1746 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1747 if upload_date is not None:
1748 return compat_str(upload_date)
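
# Illustrative usage (informal examples in the style of the upstream test suite):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('1968-12-10')
#   '19681210'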
1749
1750
1751 def unified_timestamp(date_str, day_first=True):
1752 if date_str is None:
1753 return None
1754
1755 date_str = re.sub(r'[,|]', '', date_str)
1756
1757 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1758 timezone, date_str = extract_timezone(date_str)
1759
1760 # Remove AM/PM + timezone
1761 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1762
1763 # Remove unrecognized timezones from ISO 8601 alike timestamps
1764 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1765 if m:
1766 date_str = date_str[:-len(m.group('tz'))]
1767
1768 # Python only supports microseconds, so remove nanoseconds
1769 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1770 if m:
1771 date_str = m.group(1)
1772
1773 for expression in date_formats(day_first):
1774 with contextlib.suppress(ValueError):
1775 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1776 return calendar.timegm(dt.timetuple())
1777 timetuple = email.utils.parsedate_tz(date_str)
1778 if timetuple:
1779 return calendar.timegm(timetuple) + pm_delta * 3600
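
# Illustrative usage (informal examples; values are seconds since the Unix epoch):
#   >>> unified_timestamp('1970-01-01T00:00:00Z')
#   0
#   >>> unified_timestamp('1970-01-02 00:00:00')
#   86400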
1780
1781
1782 def determine_ext(url, default_ext='unknown_video'):
1783 if url is None or '.' not in url:
1784 return default_ext
1785 guess = url.partition('?')[0].rpartition('.')[2]
1786 if re.match(r'^[A-Za-z0-9]+$', guess):
1787 return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1789 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1790 return guess.rstrip('/')
1791 else:
1792 return default_ext
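
# Illustrative usage (informal examples; the example.com URLs are placeholders):
#   >>> determine_ext('http://example.com/video.mp4?download=1')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')  # trailing slash handled
#   'mp4'
#   >>> determine_ext('no-extension-here')
#   'unknown_video'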
1793
1794
1795 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1796 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1797
1798
1799 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1800 R"""
1801 Return a datetime object from a string.
1802 Supported format:
1803 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1804
1805 @param format strftime format of DATE
1806 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1807 auto: round to the unit provided in date_str (if applicable).
1808 """
1809 auto_precision = False
1810 if precision == 'auto':
1811 auto_precision = True
1812 precision = 'microsecond'
1813 today = datetime_round(datetime.datetime.utcnow(), precision)
1814 if date_str in ('now', 'today'):
1815 return today
1816 if date_str == 'yesterday':
1817 return today - datetime.timedelta(days=1)
1818 match = re.match(
1819 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1820 date_str)
1821 if match is not None:
1822 start_time = datetime_from_str(match.group('start'), precision, format)
1823 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1824 unit = match.group('unit')
        if unit in ('month', 'year'):
1826 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1827 unit = 'day'
1828 else:
1829 if unit == 'week':
1830 unit = 'day'
1831 time *= 7
1832 delta = datetime.timedelta(**{unit + 's': time})
1833 new_date = start_time + delta
1834 if auto_precision:
1835 return datetime_round(new_date, unit)
1836 return new_date
1837
1838 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
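
# Illustrative usage (informal examples; results depend on the current clock,
# so no fixed values are shown):
#   datetime_from_str('now-1week')    # datetime 7 days ago, rounded to day precision
#   datetime_from_str('today+2days')  # datetime 2 days ahead, rounded to day precision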
1839
1840
1841 def date_from_str(date_str, format='%Y%m%d', strict=False):
1842 R"""
1843 Return a date object from a string using datetime_from_str
1844
1845 @param strict Restrict allowed patterns to "YYYYMMDD" and
1846 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1847 """
1848 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1849 raise ValueError(f'Invalid date format "{date_str}"')
1850 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1851
1852
1853 def datetime_add_months(dt, months):
1854 """Increment/Decrement a datetime object by months."""
1855 month = dt.month + months - 1
1856 year = dt.year + month // 12
1857 month = month % 12 + 1
1858 day = min(dt.day, calendar.monthrange(year, month)[1])
1859 return dt.replace(year, month, day)
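
# Illustrative usage (informal example; note the clamping to the last valid day):
#   >>> datetime_add_months(datetime.datetime(2021, 1, 31), 1)
#   datetime.datetime(2021, 2, 28, 0, 0)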
1860
1861
1862 def datetime_round(dt, precision='day'):
1863 """
1864 Round a datetime object's time to a specific precision
1865 """
1866 if precision == 'microsecond':
1867 return dt
1868
1869 unit_seconds = {
1870 'day': 86400,
1871 'hour': 3600,
1872 'minute': 60,
1873 'second': 1,
1874 }
1875 roundto = lambda x, n: ((x + n / 2) // n) * n
1876 timestamp = calendar.timegm(dt.timetuple())
1877 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1878
1879
1880 def hyphenate_date(date_str):
1881 """
1882 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1883 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1884 if match is not None:
1885 return '-'.join(match.groups())
1886 else:
1887 return date_str
1888
1889
1890 class DateRange:
1891 """Represents a time interval between two dates"""
1892
1893 def __init__(self, start=None, end=None):
1894 """start and end must be strings in the format accepted by date"""
1895 if start is not None:
1896 self.start = date_from_str(start, strict=True)
1897 else:
1898 self.start = datetime.datetime.min.date()
1899 if end is not None:
1900 self.end = date_from_str(end, strict=True)
1901 else:
1902 self.end = datetime.datetime.max.date()
1903 if self.start > self.end:
            raise ValueError('Date range "%s": the start date must be before the end date' % self)
1905
1906 @classmethod
1907 def day(cls, day):
1908 """Returns a range that only contains the given day"""
1909 return cls(day, day)
1910
1911 def __contains__(self, date):
1912 """Check if the date is in the range"""
1913 if not isinstance(date, datetime.date):
1914 date = date_from_str(date)
1915 return self.start <= date <= self.end
1916
1917 def __str__(self):
1918 return f'{self.start.isoformat()} - {self.end.isoformat()}'
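
# Illustrative usage (informal examples; dates use the strict 'YYYYMMDD' form):
#   >>> '20210101' in DateRange('20201231', '20210102')
#   True
#   >>> '20210103' in DateRange.day('20210101')
#   False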
1919
1920
1921 def platform_name():
1922 """ Returns the platform name as a compat_str """
1923 res = platform.platform()
1924 if isinstance(res, bytes):
1925 res = res.decode(preferredencoding())
1926
1927 assert isinstance(res, compat_str)
1928 return res
1929
1930
1931 @functools.cache
1932 def get_windows_version():
    ''' Get the Windows version. Returns () if not running on Windows '''
1934 if compat_os_name == 'nt':
1935 return version_tuple(platform.win32_ver()[1])
1936 else:
1937 return ()
1938
1939
1940 def write_string(s, out=None, encoding=None):
1941 assert isinstance(s, str)
1942 out = out or sys.stderr
1943
1944 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1945 s = re.sub(r'([\r\n]+)', r' \1', s)
1946
1947 enc, buffer = None, out
1948 if 'b' in getattr(out, 'mode', ''):
1949 enc = encoding or preferredencoding()
1950 elif hasattr(out, 'buffer'):
1951 buffer = out.buffer
1952 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1953
1954 buffer.write(s.encode(enc, 'ignore') if enc else s)
1955 out.flush()
1956
1957
1958 def bytes_to_intlist(bs):
1959 if not bs:
1960 return []
    if isinstance(bs[0], int):  # bytes/bytearray: elements are already ints
1962 return list(bs)
1963 else:
1964 return [ord(c) for c in bs]
1965
1966
1967 def intlist_to_bytes(xs):
1968 if not xs:
1969 return b''
1970 return compat_struct_pack('%dB' % len(xs), *xs)
1971
1972
1973 class LockingUnsupportedError(OSError):
1974 msg = 'File locking is not supported'
1975
1976 def __init__(self):
1977 super().__init__(self.msg)
1978
1979
1980 # Cross-platform file locking
1981 if sys.platform == 'win32':
1982 import ctypes.wintypes
1983 import msvcrt
1984
1985 class OVERLAPPED(ctypes.Structure):
1986 _fields_ = [
1987 ('Internal', ctypes.wintypes.LPVOID),
1988 ('InternalHigh', ctypes.wintypes.LPVOID),
1989 ('Offset', ctypes.wintypes.DWORD),
1990 ('OffsetHigh', ctypes.wintypes.DWORD),
1991 ('hEvent', ctypes.wintypes.HANDLE),
1992 ]
1993
1994 kernel32 = ctypes.windll.kernel32
1995 LockFileEx = kernel32.LockFileEx
1996 LockFileEx.argtypes = [
1997 ctypes.wintypes.HANDLE, # hFile
1998 ctypes.wintypes.DWORD, # dwFlags
1999 ctypes.wintypes.DWORD, # dwReserved
2000 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2001 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2002 ctypes.POINTER(OVERLAPPED) # Overlapped
2003 ]
2004 LockFileEx.restype = ctypes.wintypes.BOOL
2005 UnlockFileEx = kernel32.UnlockFileEx
2006 UnlockFileEx.argtypes = [
2007 ctypes.wintypes.HANDLE, # hFile
2008 ctypes.wintypes.DWORD, # dwReserved
2009 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2010 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2011 ctypes.POINTER(OVERLAPPED) # Overlapped
2012 ]
2013 UnlockFileEx.restype = ctypes.wintypes.BOOL
2014 whole_low = 0xffffffff
2015 whole_high = 0x7fffffff
2016
2017 def _lock_file(f, exclusive, block):
2018 overlapped = OVERLAPPED()
2019 overlapped.Offset = 0
2020 overlapped.OffsetHigh = 0
2021 overlapped.hEvent = 0
2022 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2023
2024 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2025 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2026 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2028 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2029
2030 def _unlock_file(f):
2031 assert f._lock_file_overlapped_p
2032 handle = msvcrt.get_osfhandle(f.fileno())
2033 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2034 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2035
2036 else:
2037 try:
2038 import fcntl
2039
2040 def _lock_file(f, exclusive, block):
2041 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2042 if not block:
2043 flags |= fcntl.LOCK_NB
2044 try:
2045 fcntl.flock(f, flags)
2046 except BlockingIOError:
2047 raise
2048 except OSError: # AOSP does not have flock()
2049 fcntl.lockf(f, flags)
2050
2051 def _unlock_file(f):
2052 try:
2053 fcntl.flock(f, fcntl.LOCK_UN)
2054 except OSError:
2055 fcntl.lockf(f, fcntl.LOCK_UN)
2056
2057 except ImportError:
2058
2059 def _lock_file(f, exclusive, block):
2060 raise LockingUnsupportedError()
2061
2062 def _unlock_file(f):
2063 raise LockingUnsupportedError()
2064
2065
2066 class locked_file:
2067 locked = False
2068
2069 def __init__(self, filename, mode, block=True, encoding=None):
2070 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2071 raise NotImplementedError(mode)
2072 self.mode, self.block = mode, block
2073
2074 writable = any(f in mode for f in 'wax+')
2075 readable = any(f in mode for f in 'r+')
2076 flags = functools.reduce(operator.ior, (
2077 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2078 getattr(os, 'O_BINARY', 0), # Windows only
2079 getattr(os, 'O_NOINHERIT', 0), # Windows only
2080 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2081 os.O_APPEND if 'a' in mode else 0,
2082 os.O_EXCL if 'x' in mode else 0,
2083 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2084 ))
2085
2086 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2087
2088 def __enter__(self):
2089 exclusive = 'r' not in self.mode
2090 try:
2091 _lock_file(self.f, exclusive, self.block)
2092 self.locked = True
2093 except OSError:
2094 self.f.close()
2095 raise
2096 if 'w' in self.mode:
2097 try:
2098 self.f.truncate()
2099 except OSError as e:
2100 if e.errno not in (
2101 errno.ESPIPE, # Illegal seek - expected for FIFO
2102 errno.EINVAL, # Invalid argument - expected for /dev/null
2103 ):
2104 raise
2105 return self
2106
2107 def unlock(self):
2108 if not self.locked:
2109 return
2110 try:
2111 _unlock_file(self.f)
2112 finally:
2113 self.locked = False
2114
2115 def __exit__(self, *_):
2116 try:
2117 self.unlock()
2118 finally:
2119 self.f.close()
2120
2121 open = __enter__
2122 close = __exit__
2123
2124 def __getattr__(self, attr):
2125 return getattr(self.f, attr)
2126
2127 def __iter__(self):
2128 return iter(self.f)
2129
2130
2131 @functools.cache
2132 def get_filesystem_encoding():
2133 encoding = sys.getfilesystemencoding()
2134 return encoding if encoding is not None else 'utf-8'
2135
2136
2137 def shell_quote(args):
2138 quoted_args = []
2139 encoding = get_filesystem_encoding()
2140 for a in args:
2141 if isinstance(a, bytes):
2142 # We may get a filename encoded with 'encodeFilename'
2143 a = a.decode(encoding)
2144 quoted_args.append(compat_shlex_quote(a))
2145 return ' '.join(quoted_args)
2146
2147
2148 def smuggle_url(url, data):
2149 """ Pass additional data in a URL for internal use. """
2150
2151 url, idata = unsmuggle_url(url, {})
2152 data.update(idata)
2153 sdata = compat_urllib_parse_urlencode(
2154 {'__youtubedl_smuggle': json.dumps(data)})
2155 return url + '#' + sdata
2156
2157
2158 def unsmuggle_url(smug_url, default=None):
2159 if '#__youtubedl_smuggle' not in smug_url:
2160 return smug_url, default
2161 url, _, sdata = smug_url.rpartition('#')
2162 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2163 data = json.loads(jsond)
2164 return url, data
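
# Illustrative round trip (informal example; URL and payload are placeholders):
#   >>> url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/video', {'referer': 'https://example.com/'})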
2165
2166
2167 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2168 """ Formats numbers with decimal sufixes like K, M, etc """
2169 num, factor = float_or_none(num), float(factor)
2170 if num is None or num < 0:
2171 return None
2172 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2173 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2174 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2175 if factor == 1024:
2176 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2177 converted = num / (factor ** exponent)
2178 return fmt % (converted, suffix)
2179
2180
2181 def format_bytes(bytes):
2182 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
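
# Illustrative usage (informal examples; factor=1024 produces the binary 'i' infix):
#   >>> format_bytes(1024)
#   '1.00KiB'
#   >>> format_decimal_suffix(12300, '%.1f%s')
#   '12.3k'
#   >>> format_bytes(None)
#   'N/A'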
2183
2184
2185 def lookup_unit_table(unit_table, s):
2186 units_re = '|'.join(re.escape(u) for u in unit_table)
2187 m = re.match(
2188 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2189 if not m:
2190 return None
2191 num_str = m.group('num').replace(',', '.')
2192 mult = unit_table[m.group('unit')]
2193 return int(float(num_str) * mult)
2194
2195
2196 def parse_filesize(s):
2197 if s is None:
2198 return None
2199
2200 # The lower-case forms are of course incorrect and unofficial,
2201 # but we support those too
2202 _UNIT_TABLE = {
2203 'B': 1,
2204 'b': 1,
2205 'bytes': 1,
2206 'KiB': 1024,
2207 'KB': 1000,
2208 'kB': 1024,
2209 'Kb': 1000,
2210 'kb': 1000,
2211 'kilobytes': 1000,
2212 'kibibytes': 1024,
2213 'MiB': 1024 ** 2,
2214 'MB': 1000 ** 2,
2215 'mB': 1024 ** 2,
2216 'Mb': 1000 ** 2,
2217 'mb': 1000 ** 2,
2218 'megabytes': 1000 ** 2,
2219 'mebibytes': 1024 ** 2,
2220 'GiB': 1024 ** 3,
2221 'GB': 1000 ** 3,
2222 'gB': 1024 ** 3,
2223 'Gb': 1000 ** 3,
2224 'gb': 1000 ** 3,
2225 'gigabytes': 1000 ** 3,
2226 'gibibytes': 1024 ** 3,
2227 'TiB': 1024 ** 4,
2228 'TB': 1000 ** 4,
2229 'tB': 1024 ** 4,
2230 'Tb': 1000 ** 4,
2231 'tb': 1000 ** 4,
2232 'terabytes': 1000 ** 4,
2233 'tebibytes': 1024 ** 4,
2234 'PiB': 1024 ** 5,
2235 'PB': 1000 ** 5,
2236 'pB': 1024 ** 5,
2237 'Pb': 1000 ** 5,
2238 'pb': 1000 ** 5,
2239 'petabytes': 1000 ** 5,
2240 'pebibytes': 1024 ** 5,
2241 'EiB': 1024 ** 6,
2242 'EB': 1000 ** 6,
2243 'eB': 1024 ** 6,
2244 'Eb': 1000 ** 6,
2245 'eb': 1000 ** 6,
2246 'exabytes': 1000 ** 6,
2247 'exbibytes': 1024 ** 6,
2248 'ZiB': 1024 ** 7,
2249 'ZB': 1000 ** 7,
2250 'zB': 1024 ** 7,
2251 'Zb': 1000 ** 7,
2252 'zb': 1000 ** 7,
2253 'zettabytes': 1000 ** 7,
2254 'zebibytes': 1024 ** 7,
2255 'YiB': 1024 ** 8,
2256 'YB': 1000 ** 8,
2257 'yB': 1024 ** 8,
2258 'Yb': 1000 ** 8,
2259 'yb': 1000 ** 8,
2260 'yottabytes': 1000 ** 8,
2261 'yobibytes': 1024 ** 8,
2262 }
2263
2264 return lookup_unit_table(_UNIT_TABLE, s)
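
# Illustrative usage (informal examples; note the SI vs binary distinction above):
#   >>> parse_filesize('1.5GB')
#   1500000000
#   >>> parse_filesize('500 KiB')
#   512000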
2265
2266
2267 def parse_count(s):
2268 if s is None:
2269 return None
2270
2271 s = re.sub(r'^[^\d]+\s', '', s).strip()
2272
2273 if re.match(r'^[\d,.]+$', s):
2274 return str_to_int(s)
2275
2276 _UNIT_TABLE = {
2277 'k': 1000,
2278 'K': 1000,
2279 'm': 1000 ** 2,
2280 'M': 1000 ** 2,
2281 'kk': 1000 ** 2,
2282 'KK': 1000 ** 2,
2283 'b': 1000 ** 3,
2284 'B': 1000 ** 3,
2285 }
2286
2287 ret = lookup_unit_table(_UNIT_TABLE, s)
2288 if ret is not None:
2289 return ret
2290
2291 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2292 if mobj:
2293 return str_to_int(mobj.group(1))
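
# Illustrative usage (informal examples):
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('1,000 views')
#   1000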
2294
2295
2296 def parse_resolution(s, *, lenient=False):
2297 if s is None:
2298 return {}
2299
2300 if lenient:
2301 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2302 else:
2303 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2304 if mobj:
2305 return {
2306 'width': int(mobj.group('w')),
2307 'height': int(mobj.group('h')),
2308 }
2309
2310 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2311 if mobj:
2312 return {'height': int(mobj.group(1))}
2313
2314 mobj = re.search(r'\b([48])[kK]\b', s)
2315 if mobj:
2316 return {'height': int(mobj.group(1)) * 540}
2317
2318 return {}
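
# Illustrative usage (informal examples; '4k'/'8k' map to 2160/4320 via the 540 factor):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}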
2319
2320
2321 def parse_bitrate(s):
2322 if not isinstance(s, compat_str):
2323 return
2324 mobj = re.search(r'\b(\d+)\s*kbps', s)
2325 if mobj:
2326 return int(mobj.group(1))
2327
2328
2329 def month_by_name(name, lang='en'):
2330 """ Return the number of a month by (locale-independently) English name """
2331
2332 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2333
2334 try:
2335 return month_names.index(name) + 1
2336 except ValueError:
2337 return None
2338
2339
2340 def month_by_abbreviation(abbrev):
2341 """ Return the number of a month by (locale-independently) English
2342 abbreviations """
2343
2344 try:
2345 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2346 except ValueError:
2347 return None
2348
2349
2350 def fix_xml_ampersands(xml_str):
2351 """Replace all the '&' by '&amp;' in XML"""
2352 return re.sub(
2353 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2354 '&amp;',
2355 xml_str)
2356
2357
2358 def setproctitle(title):
2359 assert isinstance(title, compat_str)
2360
2361 # ctypes in Jython is not complete
2362 # http://bugs.jython.org/issue2148
2363 if sys.platform.startswith('java'):
2364 return
2365
2366 try:
2367 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2368 except OSError:
2369 return
2370 except TypeError:
2371 # LoadLibrary in Windows Python 2.7.13 only expects
2372 # a bytestring, but since unicode_literals turns
2373 # every string into a unicode string, it fails.
2374 return
2375 title_bytes = title.encode()
2376 buf = ctypes.create_string_buffer(len(title_bytes))
2377 buf.value = title_bytes
2378 try:
2379 libc.prctl(15, buf, 0, 0, 0)
2380 except AttributeError:
2381 return # Strange libc, just skip this
2382
2383
2384 def remove_start(s, start):
2385 return s[len(start):] if s is not None and s.startswith(start) else s
2386
2387
2388 def remove_end(s, end):
2389 return s[:-len(end)] if s is not None and s.endswith(end) else s
2390
2391
2392 def remove_quotes(s):
2393 if s is None or len(s) < 2:
2394 return s
2395 for quote in ('"', "'", ):
2396 if s[0] == quote and s[-1] == quote:
2397 return s[1:-1]
2398 return s
2399
2400
2401 def get_domain(url):
2402 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2403 return domain.group('domain') if domain else None
2404
2405
2406 def url_basename(url):
2407 path = compat_urlparse.urlparse(url).path
2408 return path.strip('/').split('/')[-1]
2409
2410
2411 def base_url(url):
2412 return re.match(r'https?://[^?#&]+/', url).group()
2413
2414
2415 def urljoin(base, path):
2416 if isinstance(path, bytes):
2417 path = path.decode()
2418 if not isinstance(path, compat_str) or not path:
2419 return None
2420 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2421 return path
2422 if isinstance(base, bytes):
2423 base = base.decode()
2424 if not isinstance(base, compat_str) or not re.match(
2425 r'^(?:https?:)?//', base):
2426 return None
2427 return compat_urlparse.urljoin(base, path)
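
# Illustrative usage (informal examples; the hosts are placeholders):
#   >>> urljoin('https://example.com/a/', 'b/c.mp4')
#   'https://example.com/a/b/c.mp4'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/v.mp4')  # already absolute
#   '//cdn.example.com/v.mp4'
#   >>> urljoin(None, 'b/c.mp4') is None
#   True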
2428
2429
2430 class HEADRequest(compat_urllib_request.Request):
2431 def get_method(self):
2432 return 'HEAD'
2433
2434
2435 class PUTRequest(compat_urllib_request.Request):
2436 def get_method(self):
2437 return 'PUT'
2438
2439
2440 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2441 if get_attr and v is not None:
2442 v = getattr(v, get_attr, None)
2443 try:
2444 return int(v) * invscale // scale
2445 except (ValueError, TypeError, OverflowError):
2446 return default
2447
2448
2449 def str_or_none(v, default=None):
2450 return default if v is None else compat_str(v)
2451
2452
2453 def str_to_int(int_str):
2454 """ A more relaxed version of int_or_none """
2455 if isinstance(int_str, int):
2456 return int_str
2457 elif isinstance(int_str, compat_str):
2458 int_str = re.sub(r'[,\.\+]', '', int_str)
2459 return int_or_none(int_str)
2460
2461
2462 def float_or_none(v, scale=1, invscale=1, default=None):
2463 if v is None:
2464 return default
2465 try:
2466 return float(v) * invscale / scale
2467 except (ValueError, TypeError):
2468 return default
2469
2470
2471 def bool_or_none(v, default=None):
2472 return v if isinstance(v, bool) else default
2473
2474
2475 def strip_or_none(v, default=None):
2476 return v.strip() if isinstance(v, compat_str) else default
2477
2478
2479 def url_or_none(url):
2480 if not url or not isinstance(url, compat_str):
2481 return None
2482 url = url.strip()
2483 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2484
2485
2486 def request_to_url(req):
2487 if isinstance(req, compat_urllib_request.Request):
2488 return req.get_full_url()
2489 else:
2490 return req
2491
2492
2493 def strftime_or_none(timestamp, date_format, default=None):
2494 datetime_object = None
2495 try:
2496 if isinstance(timestamp, (int, float)): # unix timestamp
2497 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2498 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2499 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2500 return datetime_object.strftime(date_format)
2501 except (ValueError, TypeError, AttributeError):
2502 return default
2503
2504
2505 def parse_duration(s):
2506 if not isinstance(s, str):
2507 return None
2508 s = s.strip()
2509 if not s:
2510 return None
2511
2512 days, hours, mins, secs, ms = [None] * 5
2513 m = re.match(r'''(?x)
2514 (?P<before_secs>
2515 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2516 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2517 (?P<ms>[.:][0-9]+)?Z?$
2518 ''', s)
2519 if m:
2520 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2521 else:
2522 m = re.match(
2523 r'''(?ix)(?:P?
2524 (?:
2525 [0-9]+\s*y(?:ears?)?,?\s*
2526 )?
2527 (?:
2528 [0-9]+\s*m(?:onths?)?,?\s*
2529 )?
2530 (?:
2531 [0-9]+\s*w(?:eeks?)?,?\s*
2532 )?
2533 (?:
2534 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2535 )?
2536 T)?
2537 (?:
2538 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2539 )?
2540 (?:
2541 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2542 )?
2543 (?:
2544 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2545 )?Z?$''', s)
2546 if m:
2547 days, hours, mins, secs, ms = m.groups()
2548 else:
2549 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2550 if m:
2551 hours, mins = m.groups()
2552 else:
2553 return None
2554
2555 if ms:
2556 ms = ms.replace(':', '.')
2557 return sum(float(part or 0) * mult for part, mult in (
2558 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
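
# Illustrative usage (informal examples; all results are seconds as floats):
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('PT1H30M')  # ISO 8601 durations are accepted too
#   5400.0
#   >>> parse_duration('10 min')
#   600.0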
2559
2560
2561 def prepend_extension(filename, ext, expected_real_ext=None):
2562 name, real_ext = os.path.splitext(filename)
2563 return (
2564 f'{name}.{ext}{real_ext}'
2565 if not expected_real_ext or real_ext[1:] == expected_real_ext
2566 else f'{filename}.{ext}')
2567
2568
2569 def replace_extension(filename, ext, expected_real_ext=None):
2570 name, real_ext = os.path.splitext(filename)
2571 return '{}.{}'.format(
2572 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2573 ext)
2574
2575
2576 def check_executable(exe, args=[]):
2577 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2578 args can be a list of arguments for a short output (like -version) """
2579 try:
2580 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2581 except OSError:
2582 return False
2583 return exe
2584
2585
2586 def _get_exe_version_output(exe, args, *, to_screen=None):
2587 if to_screen:
2588 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2589 try:
2590 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2591 # SIGTTOU if yt-dlp is run in the background.
2592 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2593 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2594 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2595 except OSError:
2596 return False
2597 return stdout
2598
2599
2600 def detect_exe_version(output, version_re=None, unrecognized='present'):
2601 assert isinstance(output, compat_str)
2602 if version_re is None:
2603 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2604 m = re.search(version_re, output)
2605 if m:
2606 return m.group(1)
2607 else:
2608 return unrecognized
2609
2610
2611 def get_exe_version(exe, args=['--version'],
2612 version_re=None, unrecognized='present'):
2613 """ Returns the version of the specified executable,
2614 or False if the executable is not present """
2615 out = _get_exe_version_output(exe, args)
2616 return detect_exe_version(out, version_re, unrecognized) if out else False
2617
2618
2619 def frange(start=0, stop=None, step=1):
2620 """Float range"""
2621 if stop is None:
2622 start, stop = 0, start
2623 sign = [-1, 1][step > 0] if step else 0
2624 while sign * start < sign * stop:
2625 yield start
2626 start += step
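
# Illustrative usage (informal examples; mirrors range(), but allows float steps):
#   >>> list(frange(3))
#   [0, 1, 2]
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]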
2627
2628
2629 class LazyList(collections.abc.Sequence):
2630 """Lazy immutable list from an iterable
2631 Note that slices of a LazyList are lists and not LazyList"""
2632
2633 class IndexError(IndexError):
2634 pass
2635
2636 def __init__(self, iterable, *, reverse=False, _cache=None):
2637 self._iterable = iter(iterable)
2638 self._cache = [] if _cache is None else _cache
2639 self._reversed = reverse
2640
2641 def __iter__(self):
2642 if self._reversed:
2643 # We need to consume the entire iterable to iterate in reverse
2644 yield from self.exhaust()
2645 return
2646 yield from self._cache
2647 for item in self._iterable:
2648 self._cache.append(item)
2649 yield item
2650
2651 def _exhaust(self):
2652 self._cache.extend(self._iterable)
2653 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2654 return self._cache
2655
2656 def exhaust(self):
2657 """Evaluate the entire iterable"""
2658 return self._exhaust()[::-1 if self._reversed else 1]
2659
2660 @staticmethod
2661 def _reverse_index(x):
2662 return None if x is None else -(x + 1)
2663
2664 def __getitem__(self, idx):
2665 if isinstance(idx, slice):
2666 if self._reversed:
2667 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2668 start, stop, step = idx.start, idx.stop, idx.step or 1
2669 elif isinstance(idx, int):
2670 if self._reversed:
2671 idx = self._reverse_index(idx)
2672 start, stop, step = idx, idx, 0
2673 else:
2674 raise TypeError('indices must be integers or slices')
2675 if ((start or 0) < 0 or (stop or 0) < 0
2676 or (start is None and step < 0)
2677 or (stop is None and step > 0)):
2678 # We need to consume the entire iterable to be able to slice from the end
2679 # Obviously, never use this with infinite iterables
2680 self._exhaust()
2681 try:
2682 return self._cache[idx]
2683 except IndexError as e:
2684 raise self.IndexError(e) from e
2685 n = max(start or 0, stop or 0) - len(self._cache) + 1
2686 if n > 0:
2687 self._cache.extend(itertools.islice(self._iterable, n))
2688 try:
2689 return self._cache[idx]
2690 except IndexError as e:
2691 raise self.IndexError(e) from e
2692
2693 def __bool__(self):
2694 try:
2695 self[-1] if self._reversed else self[0]
2696 except self.IndexError:
2697 return False
2698 return True
2699
2700 def __len__(self):
2701 self._exhaust()
2702 return len(self._cache)
2703
2704 def __reversed__(self):
2705 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2706
2707 def __copy__(self):
2708 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2709
2710 def __repr__(self):
2711 # repr and str should mimic a list. So we exhaust the iterable
2712 return repr(self.exhaust())
2713
2714 def __str__(self):
2715 return repr(self.exhaust())
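
# Illustrative usage (informal examples; itertools is imported at module level.
# Even an infinite iterable is fine as long as only bounded, non-negative
# indices are requested):
#   >>> lazy = LazyList(itertools.count())
#   >>> lazy[:3]  # slices evaluate to plain lists
#   [0, 1, 2]
#   >>> lazy[5]   # consumes the iterable only up to index 5
#   5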
2716
2717
2718 class PagedList:
2719
2720 class IndexError(IndexError):
2721 pass
2722
2723 def __len__(self):
2724 # This is only useful for tests
2725 return len(self.getslice())
2726
2727 def __init__(self, pagefunc, pagesize, use_cache=True):
2728 self._pagefunc = pagefunc
2729 self._pagesize = pagesize
2730 self._pagecount = float('inf')
2731 self._use_cache = use_cache
2732 self._cache = {}
2733
2734 def getpage(self, pagenum):
2735 page_results = self._cache.get(pagenum)
2736 if page_results is None:
2737 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2738 if self._use_cache:
2739 self._cache[pagenum] = page_results
2740 return page_results
2741
2742 def getslice(self, start=0, end=None):
2743 return list(self._getslice(start, end))
2744
2745 def _getslice(self, start, end):
2746 raise NotImplementedError('This method must be implemented by subclasses')
2747
2748 def __getitem__(self, idx):
2749 assert self._use_cache, 'Indexing PagedList requires cache'
2750 if not isinstance(idx, int) or idx < 0:
2751 raise TypeError('indices must be non-negative integers')
2752 entries = self.getslice(idx, idx + 1)
2753 if not entries:
2754 raise self.IndexError()
2755 return entries[0]
2756
2757
2758 class OnDemandPagedList(PagedList):
2759 """Download pages until a page with less than maximum results"""
2760
2761 def _getslice(self, start, end):
2762 for pagenum in itertools.count(start // self._pagesize):
2763 firstid = pagenum * self._pagesize
2764 nextfirstid = pagenum * self._pagesize + self._pagesize
2765 if start >= nextfirstid:
2766 continue
2767
2768 startv = (
2769 start % self._pagesize
2770 if firstid <= start < nextfirstid
2771 else 0)
2772 endv = (
2773 ((end - 1) % self._pagesize) + 1
2774 if (end is not None and firstid <= end <= nextfirstid)
2775 else None)
2776
2777 try:
2778 page_results = self.getpage(pagenum)
2779 except Exception:
2780 self._pagecount = pagenum - 1
2781 raise
2782 if startv != 0 or endv is not None:
2783 page_results = page_results[startv:endv]
2784 yield from page_results
2785
            # A little optimization - if the current page is not "full", i.e. does
            # not contain page_size videos, then we can assume that this page
            # is the last one - there are no more ids on further pages,
            # so there is no need to query again.
2790 if len(page_results) + startv < self._pagesize:
2791 break
2792
2793 # If we got the whole page, but the next page is not interesting,
2794 # break out early as well
2795 if end == nextfirstid:
2796 break
2797
2798
2799 class InAdvancePagedList(PagedList):
2800 """PagedList with total number of pages known in advance"""
2801
2802 def __init__(self, pagefunc, pagecount, pagesize):
2803 PagedList.__init__(self, pagefunc, pagesize, True)
2804 self._pagecount = pagecount
2805
2806 def _getslice(self, start, end):
2807 start_page = start // self._pagesize
2808 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2809 skip_elems = start - start_page * self._pagesize
2810 only_more = None if end is None else end - start
2811 for pagenum in range(start_page, end_page):
2812 page_results = self.getpage(pagenum)
2813 if skip_elems:
2814 page_results = page_results[skip_elems:]
2815 skip_elems = None
2816 if only_more is not None:
2817 if len(page_results) < only_more:
2818 only_more -= len(page_results)
2819 else:
2820 yield from page_results[:only_more]
2821 break
2822 yield from page_results
2823
2824
2825 class PlaylistEntries:
2826 MissingEntry = object()
2827 is_exhausted = False
2828
2829 def __init__(self, ydl, info_dict):
2830 self.ydl = ydl
2831
2832 # _entries must be assigned now since infodict can change during iteration
2833 entries = info_dict.get('entries')
2834 if entries is None:
2835 raise EntryNotInPlaylist('There are no entries')
2836 elif isinstance(entries, list):
2837 self.is_exhausted = True
2838
2839 requested_entries = info_dict.get('requested_entries')
2840 self.is_incomplete = bool(requested_entries)
2841 if self.is_incomplete:
2842 assert self.is_exhausted
2843 self._entries = [self.MissingEntry] * max(requested_entries)
2844 for i, entry in zip(requested_entries, entries):
2845 self._entries[i - 1] = entry
2846 elif isinstance(entries, (list, PagedList, LazyList)):
2847 self._entries = entries
2848 else:
2849 self._entries = LazyList(entries)
2850
2851 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2852 (?P<start>[+-]?\d+)?
2853 (?P<range>[:-]
2854 (?P<end>[+-]?\d+|inf(?:inite)?)?
2855 (?::(?P<step>[+-]?\d+))?
2856 )?''')
2857
2858 @classmethod
2859 def parse_playlist_items(cls, string):
2860 for segment in string.split(','):
2861 if not segment:
                raise ValueError('There are two or more consecutive commas')
2863 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2864 if not mobj:
2865 raise ValueError(f'{segment!r} is not a valid specification')
2866 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2867 if int_or_none(step) == 0:
2868 raise ValueError(f'Step in {segment!r} cannot be zero')
2869 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2870
2871 def get_requested_items(self):
2872 playlist_items = self.ydl.params.get('playlist_items')
2873 playlist_start = self.ydl.params.get('playliststart', 1)
2874 playlist_end = self.ydl.params.get('playlistend')
2875 # For backwards compatibility, interpret -1 as whole list
2876 if playlist_end in (-1, None):
2877 playlist_end = ''
2878 if not playlist_items:
2879 playlist_items = f'{playlist_start}:{playlist_end}'
2880 elif playlist_start != 1 or playlist_end:
2881 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2882
2883 for index in self.parse_playlist_items(playlist_items):
2884 for i, entry in self[index]:
2885 yield i, entry
2886 try:
2887 # TODO: Add auto-generated fields
2888 self.ydl._match_entry(entry, incomplete=True, silent=True)
2889 except (ExistingVideoReached, RejectedVideoReached):
2890 return
2891
2892 def get_full_count(self):
2893 if self.is_exhausted and not self.is_incomplete:
2894 return len(self)
2895 elif isinstance(self._entries, InAdvancePagedList):
2896 if self._entries._pagesize == 1:
2897 return self._entries._pagecount
2898
2899 @functools.cached_property
2900 def _getter(self):
2901 if isinstance(self._entries, list):
2902 def get_entry(i):
2903 try:
2904 entry = self._entries[i]
2905 except IndexError:
2906 entry = self.MissingEntry
2907 if not self.is_incomplete:
2908 raise self.IndexError()
2909 if entry is self.MissingEntry:
2910 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2911 return entry
2912 else:
2913 def get_entry(i):
2914 try:
2915 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2916 except (LazyList.IndexError, PagedList.IndexError):
2917 raise self.IndexError()
2918 return get_entry
2919
2920 def __getitem__(self, idx):
2921 if isinstance(idx, int):
2922 idx = slice(idx, idx)
2923
2924 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2925 step = 1 if idx.step is None else idx.step
2926 if idx.start is None:
2927 start = 0 if step > 0 else len(self) - 1
2928 else:
2929 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2930
2931 # NB: Do not call len(self) when idx == [:]
2932 if idx.stop is None:
2933 stop = 0 if step < 0 else float('inf')
2934 else:
2935 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2936 stop += [-1, 1][step > 0]
2937
2938 for i in frange(start, stop, step):
2939 if i < 0:
2940 continue
2941 try:
2942 entry = self._getter(i)
2943 except self.IndexError:
2944 self.is_exhausted = True
2945 if step > 0:
2946 break
2947 continue
2948 yield i + 1, entry
2949
2950 def __len__(self):
2951 return len(tuple(self[:]))
2952
2953 class IndexError(IndexError):
2954 pass
2955
2956
2957 def uppercase_escape(s):
2958 unicode_escape = codecs.getdecoder('unicode_escape')
2959 return re.sub(
2960 r'\\U[0-9a-fA-F]{8}',
2961 lambda m: unicode_escape(m.group(0))[0],
2962 s)
2963
2964
2965 def lowercase_escape(s):
2966 unicode_escape = codecs.getdecoder('unicode_escape')
2967 return re.sub(
2968 r'\\u[0-9a-fA-F]{4}',
2969 lambda m: unicode_escape(m.group(0))[0],
2970 s)
2971
2972
2973 def escape_rfc3986(s):
2974 """Escape non-ASCII characters as suggested by RFC 3986"""
2975 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2976
2977
2978 def escape_url(url):
2979 """Escape URL as suggested by RFC 3986"""
2980 url_parsed = compat_urllib_parse_urlparse(url)
2981 return url_parsed._replace(
2982 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2983 path=escape_rfc3986(url_parsed.path),
2984 params=escape_rfc3986(url_parsed.params),
2985 query=escape_rfc3986(url_parsed.query),
2986 fragment=escape_rfc3986(url_parsed.fragment)
2987 ).geturl()
2988
2989
2990 def parse_qs(url):
2991 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2992
2993
2994 def read_batch_urls(batch_fd):
2995 def fixup(url):
2996 if not isinstance(url, compat_str):
2997 url = url.decode('utf-8', 'replace')
2998 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2999 for bom in BOM_UTF8:
3000 if url.startswith(bom):
3001 url = url[len(bom):]
3002 url = url.lstrip()
3003 if not url or url.startswith(('#', ';', ']')):
3004 return False
3005 # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out when it follows whitespace
3007 return re.split(r'\s#', url, 1)[0].rstrip()
3008
3009 with contextlib.closing(batch_fd) as fd:
3010 return [url for url in map(fixup, fd) if url]
3011
3012
3013 def urlencode_postdata(*args, **kargs):
3014 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
3015
3016
3017 def update_url_query(url, query):
3018 if not query:
3019 return url
3020 parsed_url = compat_urlparse.urlparse(url)
3021 qs = compat_parse_qs(parsed_url.query)
3022 qs.update(query)
3023 return compat_urlparse.urlunparse(parsed_url._replace(
3024 query=compat_urllib_parse_urlencode(qs, True)))
3025
3026
3027 def update_Request(req, url=None, data=None, headers={}, query={}):
3028 req_headers = req.headers.copy()
3029 req_headers.update(headers)
3030 req_data = data or req.data
3031 req_url = update_url_query(url or req.get_full_url(), query)
3032 req_get_method = req.get_method()
3033 if req_get_method == 'HEAD':
3034 req_type = HEADRequest
3035 elif req_get_method == 'PUT':
3036 req_type = PUTRequest
3037 else:
3038 req_type = compat_urllib_request.Request
3039 new_req = req_type(
3040 req_url, data=req_data, headers=req_headers,
3041 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3042 if hasattr(req, 'timeout'):
3043 new_req.timeout = req.timeout
3044 return new_req
3045
3046
3047 def _multipart_encode_impl(data, boundary):
3048 content_type = 'multipart/form-data; boundary=%s' % boundary
3049
3050 out = b''
3051 for k, v in data.items():
3052 out += b'--' + boundary.encode('ascii') + b'\r\n'
3053 if isinstance(k, compat_str):
3054 k = k.encode()
3055 if isinstance(v, compat_str):
3056 v = v.encode()
3057 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3058 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3059 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3060 if boundary.encode('ascii') in content:
3061 raise ValueError('Boundary overlaps with data')
3062 out += content
3063
3064 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3065
3066 return out, content_type
3067
3068
3069 def multipart_encode(data, boundary=None):
3070 '''
3071 Encode a dict to RFC 7578-compliant form-data
3072
3073 data:
3074 A dict where keys and values can be either Unicode or bytes-like
3075 objects.
3076 boundary:
        If specified, it must be a Unicode object and is used as the boundary.
        Otherwise a random boundary is generated.
3079
3080 Reference: https://tools.ietf.org/html/rfc7578
3081 '''
3082 has_specified_boundary = boundary is not None
3083
3084 while True:
3085 if boundary is None:
3086 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3087
3088 try:
3089 out, content_type = _multipart_encode_impl(data, boundary)
3090 break
3091 except ValueError:
3092 if has_specified_boundary:
3093 raise
3094 boundary = None
3095
3096 return out, content_type
3097
3098
3099 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3100 for val in map(d.get, variadic(key_or_keys)):
3101 if val is not None and (val or not skip_false_values):
3102 return val
3103 return default
3104
3105
3106 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3107 for f in funcs:
3108 try:
3109 val = f(*args, **kwargs)
3110 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3111 pass
3112 else:
3113 if expected_type is None or isinstance(val, expected_type):
3114 return val
3115
3116
3117 def try_get(src, getter, expected_type=None):
3118 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3119
3120
3121 def filter_dict(dct, cndn=lambda _, v: v is not None):
3122 return {k: v for k, v in dct.items() if cndn(k, v)}
3123
3124
3125 def merge_dicts(*dicts):
3126 merged = {}
3127 for a_dict in dicts:
3128 for k, v in a_dict.items():
            if ((v is not None and k not in merged)
                    or (isinstance(v, str) and merged[k] == '')):
3131 merged[k] = v
3132 return merged
3133
3134
3135 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3136 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3137
3138
3139 US_RATINGS = {
3140 'G': 0,
3141 'PG': 10,
3142 'PG-13': 13,
3143 'R': 16,
3144 'NC': 18,
3145 }
3146
3147
3148 TV_PARENTAL_GUIDELINES = {
3149 'TV-Y': 0,
3150 'TV-Y7': 7,
3151 'TV-G': 0,
3152 'TV-PG': 0,
3153 'TV-14': 14,
3154 'TV-MA': 17,
3155 }
3156
3157
3158 def parse_age_limit(s):
3159 # isinstance(False, int) is True. So type() must be used instead
3160 if type(s) is int: # noqa: E721
3161 return s if 0 <= s <= 21 else None
3162 elif not isinstance(s, str):
3163 return None
3164 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3165 if m:
3166 return int(m.group('age'))
3167 s = s.upper()
3168 if s in US_RATINGS:
3169 return US_RATINGS[s]
3170 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3171 if m:
3172 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3173 return None
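
# Illustrative usage (informal examples):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18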
3174
3175
3176 def strip_jsonp(code):
3177 return re.sub(
3178 r'''(?sx)^
3179 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3180 (?:\s*&&\s*(?P=func_name))?
3181 \s*\(\s*(?P<callback_data>.*)\);?
3182 \s*?(?://[^\n]*)*$''',
3183 r'\g<callback_data>', code)
3184
3185
3186 def js_to_json(code, vars={}):
3187 # vars is a dict of var, val pairs to substitute
3188 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3189 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3190 INTEGER_TABLE = (
3191 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3192 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3193 )
3194
3195 def fix_kv(m):
3196 v = m.group(0)
3197 if v in ('true', 'false', 'null'):
3198 return v
3199 elif v in ('undefined', 'void 0'):
3200 return 'null'
3201 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3202 return ""
3203
3204 if v[0] in ("'", '"'):
3205 v = re.sub(r'(?s)\\.|"', lambda m: {
3206 '"': '\\"',
3207 "\\'": "'",
3208 '\\\n': '',
3209 '\\x': '\\u00',
3210 }.get(m.group(0), m.group(0)), v[1:-1])
3211 else:
3212 for regex, base in INTEGER_TABLE:
3213 im = re.match(regex, v)
3214 if im:
3215 i = int(im.group(1), base)
3216 return '"%d":' % i if v.endswith(':') else '%d' % i
3217
3218 if v in vars:
3219 return vars[v]
3220
3221 return '"%s"' % v
3222
3223 def create_map(mobj):
3224 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3225
3226 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3227 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3228
3229 return re.sub(r'''(?sx)
3230 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3231 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3232 {comment}|,(?={skip}[\]}}])|
3233 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3234 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3235 [0-9]+(?={skip}:)|
3236 !+
3237 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
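
# Illustrative usage (informal example; the input is a made-up JS object literal):
#   >>> js_to_json("{abc: 'def', ghi: 0x1F, jkl: undefined}")
#   '{"abc": "def", "ghi": 31, "jkl": null}'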
3238
3239
3240 def qualities(quality_ids):
3241 """ Get a numeric quality value out of a list of possible values """
3242 def q(qid):
3243 try:
3244 return quality_ids.index(qid)
3245 except ValueError:
3246 return -1
3247 return q
3248
3249
3250 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3251
3252
3253 DEFAULT_OUTTMPL = {
3254 'default': '%(title)s [%(id)s].%(ext)s',
3255 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3256 }
3257 OUTTMPL_TYPES = {
3258 'chapter': None,
3259 'subtitle': None,
3260 'thumbnail': None,
3261 'description': 'description',
3262 'annotation': 'annotations.xml',
3263 'infojson': 'info.json',
3264 'link': None,
3265 'pl_video': None,
3266 'pl_thumbnail': None,
3267 'pl_description': 'description',
3268 'pl_infojson': 'info.json',
3269 }
3270
# As of [1], the format syntax is:
3272 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3273 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3274 STR_FORMAT_RE_TMPL = r'''(?x)
3275 (?<!%)(?P<prefix>(?:%%)*)
3276 %
3277 (?P<has_key>\((?P<key>{0})\))?
3278 (?P<format>
3279 (?P<conversion>[#0\-+ ]+)?
3280 (?P<min_width>\d+)?
3281 (?P<precision>\.\d+)?
3282 (?P<len_mod>[hlL])? # unused in python
3283 {1} # conversion type
3284 )
3285 '''
3286
3287
3288 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3289
3290
3291 def limit_length(s, length):
3292 """ Add ellipses to overly long strings """
3293 if s is None:
3294 return None
3295 ELLIPSES = '...'
3296 if len(s) > length:
3297 return s[:length - len(ELLIPSES)] + ELLIPSES
3298 return s
3299
3300
3301 def version_tuple(v):
3302 return tuple(int(e) for e in re.split(r'[-.]', v))
3303
3304
3305 def is_outdated_version(version, limit, assume_new=True):
3306 if not version:
3307 return not assume_new
3308 try:
3309 return version_tuple(version) < version_tuple(limit)
3310 except ValueError:
3311 return not assume_new
3312
3313
3314 def ytdl_is_updateable():
3315 """ Returns if yt-dlp can be updated with -U """
3316
3317 from .update import is_non_updateable
3318
3319 return not is_non_updateable()
3320
3321
3322 def args_to_str(args):
3323 # Get a short string representation for a subprocess command
3324 return ' '.join(compat_shlex_quote(a) for a in args)
3325
3326
3327 def error_to_compat_str(err):
3328 return str(err)
3329
3330
3331 def error_to_str(err):
3332 return f'{type(err).__name__}: {err}'
3333
3334
3335 def mimetype2ext(mt):
3336 if mt is None:
3337 return None
3338
3339 mt, _, params = mt.partition(';')
3340 mt = mt.strip()
3341
3342 FULL_MAP = {
3343 'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
        # here since it is the most common one
3346 'audio/mpeg': 'mp3',
3347 'audio/x-wav': 'wav',
3348 'audio/wav': 'wav',
3349 'audio/wave': 'wav',
3350 }
3351
3352 ext = FULL_MAP.get(mt)
3353 if ext is not None:
3354 return ext
3355
3356 SUBTYPE_MAP = {
3357 '3gpp': '3gp',
3358 'smptett+xml': 'tt',
3359 'ttaf+xml': 'dfxp',
3360 'ttml+xml': 'ttml',
3361 'x-flv': 'flv',
3362 'x-mp4-fragmented': 'mp4',
3363 'x-ms-sami': 'sami',
3364 'x-ms-wmv': 'wmv',
3365 'mpegurl': 'm3u8',
3366 'x-mpegurl': 'm3u8',
3367 'vnd.apple.mpegurl': 'm3u8',
3368 'dash+xml': 'mpd',
3369 'f4m+xml': 'f4m',
3370 'hds+xml': 'f4m',
3371 'vnd.ms-sstr+xml': 'ism',
3372 'quicktime': 'mov',
3373 'mp2t': 'ts',
3374 'x-wav': 'wav',
3375 'filmstrip+json': 'fs',
3376 'svg+xml': 'svg',
3377 }
3378
3379 _, _, subtype = mt.rpartition('/')
3380 ext = SUBTYPE_MAP.get(subtype.lower())
3381 if ext is not None:
3382 return ext
3383
3384 SUFFIX_MAP = {
3385 'json': 'json',
3386 'xml': 'xml',
3387 'zip': 'zip',
3388 'gzip': 'gz',
3389 }
3390
3391 _, _, suffix = subtype.partition('+')
3392 ext = SUFFIX_MAP.get(suffix)
3393 if ext is not None:
3394 return ext
3395
3396 return subtype.replace('+', '.')
3397
3398
3399 def ext2mimetype(ext_or_url):
3400 if not ext_or_url:
3401 return None
3402 if '.' not in ext_or_url:
3403 ext_or_url = f'file.{ext_or_url}'
3404 return mimetypes.guess_type(ext_or_url)[0]
3405
3406
3407 def parse_codecs(codecs_str):
3408 # http://tools.ietf.org/html/rfc6381
3409 if not codecs_str:
3410 return {}
3411 split_codecs = list(filter(None, map(
3412 str.strip, codecs_str.strip().strip(',').split(','))))
3413 vcodec, acodec, scodec, hdr = None, None, None, None
3414 for full_codec in split_codecs:
3415 parts = full_codec.split('.')
3416 codec = parts[0].replace('0', '')
3417 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3418 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3419 if not vcodec:
3420 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3421 if codec in ('dvh1', 'dvhe'):
3422 hdr = 'DV'
3423 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3424 hdr = 'HDR10'
3425 elif full_codec.replace('0', '').startswith('vp9.2'):
3426 hdr = 'HDR10'
3427 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3428 if not acodec:
3429 acodec = full_codec
3430 elif codec in ('stpp', 'wvtt',):
3431 if not scodec:
3432 scodec = full_codec
3433 else:
3434 write_string(f'WARNING: Unknown codec {full_codec}\n')
3435 if vcodec or acodec or scodec:
3436 return {
3437 'vcodec': vcodec or 'none',
3438 'acodec': acodec or 'none',
3439 'dynamic_range': hdr,
3440 **({'scodec': scodec} if scodec is not None else {}),
3441 }
3442 elif len(split_codecs) == 2:
3443 return {
3444 'vcodec': split_codecs[0],
3445 'acodec': split_codecs[1],
3446 }
3447 return {}
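
# Illustrative usage (informal examples; codec strings as typically found in
# DASH/HLS manifests):
#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   >>> parse_codecs('vp9.2')  # profile 2 implies HDR
#   {'vcodec': 'vp9.2', 'acodec': 'none', 'dynamic_range': 'HDR10'}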
3448
3449
3450 def urlhandle_detect_ext(url_handle):
3451 getheader = url_handle.headers.get
3452
3453 cd = getheader('Content-Disposition')
3454 if cd:
3455 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3456 if m:
3457 e = determine_ext(m.group('filename'), default_ext=None)
3458 if e:
3459 return e
3460
3461 return mimetype2ext(getheader('Content-Type'))
3462
3463
3464 def encode_data_uri(data, mime_type):
3465 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3466
3467
3468 def age_restricted(content_limit, age_limit):
3469 """ Returns True iff the content should be blocked """
3470
3471 if age_limit is None: # No limit set
3472 return False
3473 if content_limit is None:
3474 return False # Content available for everyone
3475 return age_limit < content_limit
3476
3477
3478 def is_html(first_bytes):
3479 """ Detect whether a file contains HTML by examining its first bytes. """
3480
3481 BOMS = [
3482 (b'\xef\xbb\xbf', 'utf-8'),
3483 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3484 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3485 (b'\xff\xfe', 'utf-16-le'),
3486 (b'\xfe\xff', 'utf-16-be'),
3487 ]
3488
3489 encoding = 'utf-8'
3490 for bom, enc in BOMS:
3491 while first_bytes.startswith(bom):
3492 encoding, first_bytes = enc, first_bytes[len(bom):]
3493
3494 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3495
3496
3497 def determine_protocol(info_dict):
3498 protocol = info_dict.get('protocol')
3499 if protocol is not None:
3500 return protocol
3501
3502 url = sanitize_url(info_dict['url'])
3503 if url.startswith('rtmp'):
3504 return 'rtmp'
3505 elif url.startswith('mms'):
3506 return 'mms'
3507 elif url.startswith('rtsp'):
3508 return 'rtsp'
3509
3510 ext = determine_ext(url)
3511 if ext == 'm3u8':
3512 return 'm3u8'
3513 elif ext == 'f4m':
3514 return 'f4m'
3515
3516 return compat_urllib_parse_urlparse(url).scheme
3517
3518
3519 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3520 """ Render a list of rows, each as a list of values.
3521 Text after a \t will be right aligned """
3522 def width(string):
3523 return len(remove_terminal_sequences(string).replace('\t', ''))
3524
3525 def get_max_lens(table):
3526 return [max(width(str(v)) for v in col) for col in zip(*table)]
3527
3528 def filter_using_list(row, filterArray):
3529 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3530
3531 max_lens = get_max_lens(data) if hide_empty else []
3532 header_row = filter_using_list(header_row, max_lens)
3533 data = [filter_using_list(row, max_lens) for row in data]
3534
3535 table = [header_row] + data
3536 max_lens = get_max_lens(table)
3537 extra_gap += 1
3538 if delim:
3539 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3540 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3541 for row in table:
3542 for pos, text in enumerate(map(str, row)):
3543 if '\t' in text:
3544 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3545 else:
3546 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3547 ret = '\n'.join(''.join(row).rstrip() for row in table)
3548 return ret
3549
3550
3551 def _match_one(filter_part, dct, incomplete):
3552 # TODO: Generalize code with YoutubeDL._build_format_filter
3553 STRING_OPERATORS = {
3554 '*=': operator.contains,
3555 '^=': lambda attr, value: attr.startswith(value),
3556 '$=': lambda attr, value: attr.endswith(value),
3557 '~=': lambda attr, value: re.search(value, attr),
3558 }
3559 COMPARISON_OPERATORS = {
3560 **STRING_OPERATORS,
3561 '<=': operator.le, # "<=" must be defined above "<"
3562 '<': operator.lt,
3563 '>=': operator.ge,
3564 '>': operator.gt,
3565 '=': operator.eq,
3566 }
3567
3568 if isinstance(incomplete, bool):
3569 is_incomplete = lambda _: incomplete
3570 else:
3571 is_incomplete = lambda k: k in incomplete
3572
3573 operator_rex = re.compile(r'''(?x)
3574 (?P<key>[a-z_]+)
3575 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3576 (?:
3577 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3578 (?P<strval>.+?)
3579 )
3580 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3581 m = operator_rex.fullmatch(filter_part.strip())
3582 if m:
3583 m = m.groupdict()
3584 unnegated_op = COMPARISON_OPERATORS[m['op']]
3585 if m['negation']:
3586 op = lambda attr, value: not unnegated_op(attr, value)
3587 else:
3588 op = unnegated_op
3589 comparison_value = m['quotedstrval'] or m['strval']
3590 if m['quote']:
3591 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3592 actual_value = dct.get(m['key'])
3593 numeric_comparison = None
3594 if isinstance(actual_value, (int, float)):
3595 # If the original field is a string and the matching comparison value is
3596 # a number, we should respect the origin of the original field
3597 # and process the comparison value as a string (see
3598 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3599 try:
3600 numeric_comparison = int(comparison_value)
3601 except ValueError:
3602 numeric_comparison = parse_filesize(comparison_value)
3603 if numeric_comparison is None:
3604 numeric_comparison = parse_filesize(f'{comparison_value}B')
3605 if numeric_comparison is None:
3606 numeric_comparison = parse_duration(comparison_value)
3607 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3608 raise ValueError('Operator %s only supports string values!' % m['op'])
3609 if actual_value is None:
3610 return is_incomplete(m['key']) or m['none_inclusive']
3611 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3612
3613 UNARY_OPERATORS = {
3614 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3615 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3616 }
3617 operator_rex = re.compile(r'''(?x)
3618 (?P<op>%s)\s*(?P<key>[a-z_]+)
3619 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3620 m = operator_rex.fullmatch(filter_part.strip())
3621 if m:
3622 op = UNARY_OPERATORS[m.group('op')]
3623 actual_value = dct.get(m.group('key'))
3624 if is_incomplete(m.group('key')) and actual_value is None:
3625 return True
3626 return op(actual_value)
3627
3628 raise ValueError('Invalid filter part %r' % filter_part)
3629
3630
3631 def match_str(filter_str, dct, incomplete=False):
3632 """ Filter a dictionary with a simple string syntax.
3633 @returns Whether the filter passes
3634 @param incomplete Set of keys that are expected to be missing from dct.
3635 Can be True/False to indicate all/none of the keys may be missing.
3636 All conditions on incomplete keys pass if the key is missing
3637 """
3638 return all(
3639 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3640 for filter_part in re.split(r'(?<!\\)&', filter_str))
3641
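# Usage sketch (illustrative): clauses separated by '&' must all pass.
#   >>> match_str('duration > 60 & title *= cat & !is_live',
#   ...           {'duration': 90, 'title': 'cat video', 'is_live': False})
#   True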
3642
3643 def match_filter_func(filters):
3644 if not filters:
3645 return None
3646 filters = set(variadic(filters))
3647
3648 interactive = '-' in filters
3649 if interactive:
3650 filters.remove('-')
3651
3652 def _match_func(info_dict, incomplete=False):
3653 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3654 return NO_DEFAULT if interactive and not incomplete else None
3655 else:
3656 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3657 filter_str = ') | ('.join(map(str.strip, filters))
3658 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3659 return _match_func
3660
3661
3662 def download_range_func(chapters, ranges):
3663 def inner(info_dict, ydl):
3664 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3665 else 'Cannot match chapters since chapter information is unavailable')
3666 for regex in chapters or []:
3667 for i, chapter in enumerate(info_dict.get('chapters') or []):
3668 if re.search(regex, chapter['title']):
3669 warning = None
3670 yield {**chapter, 'index': i}
3671 if chapters and warning:
3672 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3673
3674 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3675
3676 return inner
3677
3678
3679 def parse_dfxp_time_expr(time_expr):
3680 if not time_expr:
3681 return
3682
3683 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3684 if mobj:
3685 return float(mobj.group('time_offset'))
3686
3687 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3688 if mobj:
3689 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3690
3691
3692 def srt_subtitles_timecode(seconds):
3693 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3694
3695
3696 def ass_subtitles_timecode(seconds):
3697 time = timetuple_from_msec(seconds * 1000)
3698 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3699
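# Usage sketch (illustrative): SRT uses a comma and milliseconds, ASS a dot
# and centiseconds.
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'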
3700
3701 def dfxp2srt(dfxp_data):
3702 '''
3703 @param dfxp_data A bytes-like object containing DFXP data
3704 @returns A unicode object containing converted SRT data
3705 '''
3706 LEGACY_NAMESPACES = (
3707 (b'http://www.w3.org/ns/ttml', [
3708 b'http://www.w3.org/2004/11/ttaf1',
3709 b'http://www.w3.org/2006/04/ttaf1',
3710 b'http://www.w3.org/2006/10/ttaf1',
3711 ]),
3712 (b'http://www.w3.org/ns/ttml#styling', [
3713 b'http://www.w3.org/ns/ttml#style',
3714 ]),
3715 )
3716
3717 SUPPORTED_STYLING = [
3718 'color',
3719 'fontFamily',
3720 'fontSize',
3721 'fontStyle',
3722 'fontWeight',
3723 'textDecoration'
3724 ]
3725
3726 _x = functools.partial(xpath_with_ns, ns_map={
3727 'xml': 'http://www.w3.org/XML/1998/namespace',
3728 'ttml': 'http://www.w3.org/ns/ttml',
3729 'tts': 'http://www.w3.org/ns/ttml#styling',
3730 })
3731
3732 styles = {}
3733 default_style = {}
3734
3735 class TTMLPElementParser:
3736 _out = ''
3737 _unclosed_elements = []
3738 _applied_styles = []
3739
3740 def start(self, tag, attrib):
3741 if tag in (_x('ttml:br'), 'br'):
3742 self._out += '\n'
3743 else:
3744 unclosed_elements = []
3745 style = {}
3746 element_style_id = attrib.get('style')
3747 if default_style:
3748 style.update(default_style)
3749 if element_style_id:
3750 style.update(styles.get(element_style_id, {}))
3751 for prop in SUPPORTED_STYLING:
3752 prop_val = attrib.get(_x('tts:' + prop))
3753 if prop_val:
3754 style[prop] = prop_val
3755 if style:
3756 font = ''
3757 for k, v in sorted(style.items()):
3758 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3759 continue
3760 if k == 'color':
3761 font += ' color="%s"' % v
3762 elif k == 'fontSize':
3763 font += ' size="%s"' % v
3764 elif k == 'fontFamily':
3765 font += ' face="%s"' % v
3766 elif k == 'fontWeight' and v == 'bold':
3767 self._out += '<b>'
3768 unclosed_elements.append('b')
3769 elif k == 'fontStyle' and v == 'italic':
3770 self._out += '<i>'
3771 unclosed_elements.append('i')
3772 elif k == 'textDecoration' and v == 'underline':
3773 self._out += '<u>'
3774 unclosed_elements.append('u')
3775 if font:
3776 self._out += '<font' + font + '>'
3777 unclosed_elements.append('font')
3778 applied_style = {}
3779 if self._applied_styles:
3780 applied_style.update(self._applied_styles[-1])
3781 applied_style.update(style)
3782 self._applied_styles.append(applied_style)
3783 self._unclosed_elements.append(unclosed_elements)
3784
3785 def end(self, tag):
3786 if tag not in (_x('ttml:br'), 'br'):
3787 unclosed_elements = self._unclosed_elements.pop()
3788 for element in reversed(unclosed_elements):
3789 self._out += '</%s>' % element
3790 if unclosed_elements and self._applied_styles:
3791 self._applied_styles.pop()
3792
3793 def data(self, data):
3794 self._out += data
3795
3796 def close(self):
3797 return self._out.strip()
3798
3799 def parse_node(node):
3800 target = TTMLPElementParser()
3801 parser = xml.etree.ElementTree.XMLParser(target=target)
3802 parser.feed(xml.etree.ElementTree.tostring(node))
3803 return parser.close()
3804
3805 for k, v in LEGACY_NAMESPACES:
3806 for ns in v:
3807 dfxp_data = dfxp_data.replace(ns, k)
3808
3809 dfxp = compat_etree_fromstring(dfxp_data)
3810 out = []
3811 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3812
3813 if not paras:
3814 raise ValueError('Invalid dfxp/TTML subtitle')
3815
3816 repeat = False
3817 while True:
3818 for style in dfxp.findall(_x('.//ttml:style')):
3819 style_id = style.get('id') or style.get(_x('xml:id'))
3820 if not style_id:
3821 continue
3822 parent_style_id = style.get('style')
3823 if parent_style_id:
3824 if parent_style_id not in styles:
3825 repeat = True
3826 continue
3827 styles[style_id] = styles[parent_style_id].copy()
3828 for prop in SUPPORTED_STYLING:
3829 prop_val = style.get(_x('tts:' + prop))
3830 if prop_val:
3831 styles.setdefault(style_id, {})[prop] = prop_val
3832 if repeat:
3833 repeat = False
3834 else:
3835 break
3836
3837 for p in ('body', 'div'):
3838 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3839 if ele is None:
3840 continue
3841 style = styles.get(ele.get('style'))
3842 if not style:
3843 continue
3844 default_style.update(style)
3845
3846 for para, index in zip(paras, itertools.count(1)):
3847 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3848 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3849 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3850 if begin_time is None:
3851 continue
3852 if not end_time:
3853 if not dur:
3854 continue
3855 end_time = begin_time + dur
3856 out.append('%d\n%s --> %s\n%s\n\n' % (
3857 index,
3858 srt_subtitles_timecode(begin_time),
3859 srt_subtitles_timecode(end_time),
3860 parse_node(para)))
3861
3862 return ''.join(out)
3863
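# Usage sketch (illustrative, with a minimal hand-written TTML document):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><p begin="0s" end="1.5s">Hi</p></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,500\nHi\n\n'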
3864
3865 def cli_option(params, command_option, param, separator=None):
3866 param = params.get(param)
3867 return ([] if param is None
3868 else [command_option, str(param)] if separator is None
3869 else [f'{command_option}{separator}{param}'])
3870
3871
3872 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3873 param = params.get(param)
3874 assert param in (True, False, None)
3875 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3876
3877
3878 def cli_valueless_option(params, command_option, param, expected_value=True):
3879 return [command_option] if params.get(param) == expected_value else []
3880
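# Usage sketch for the cli_* helpers (illustrative, with hypothetical params):
#   >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', '127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'quiet': False}, '--quiet', 'quiet')
#   []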
3881
3882 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3883 if isinstance(argdict, (list, tuple)): # for backward compatibility
3884 if use_compat:
3885 return argdict
3886 else:
3887 argdict = None
3888 if argdict is None:
3889 return default
3890 assert isinstance(argdict, dict)
3891
3892 assert isinstance(keys, (list, tuple))
3893 for key_list in keys:
3894 arg_list = list(filter(
3895 lambda x: x is not None,
3896 [argdict.get(key.lower()) for key in variadic(key_list)]))
3897 if arg_list:
3898 return [arg for args in arg_list for arg in args]
3899 return default
3900
3901
3902 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3903 main_key, exe = main_key.lower(), exe.lower()
3904 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3905 keys = [f'{root_key}{k}' for k in (keys or [''])]
3906 if root_key in keys:
3907 if main_key != exe:
3908 keys.append((main_key, exe))
3909 keys.append('default')
3910 else:
3911 use_compat = False
3912 return cli_configuration_args(argdict, keys, default, use_compat)
3913
3914
3915 class ISO639Utils:
3916 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3917 _lang_map = {
3918 'aa': 'aar',
3919 'ab': 'abk',
3920 'ae': 'ave',
3921 'af': 'afr',
3922 'ak': 'aka',
3923 'am': 'amh',
3924 'an': 'arg',
3925 'ar': 'ara',
3926 'as': 'asm',
3927 'av': 'ava',
3928 'ay': 'aym',
3929 'az': 'aze',
3930 'ba': 'bak',
3931 'be': 'bel',
3932 'bg': 'bul',
3933 'bh': 'bih',
3934 'bi': 'bis',
3935 'bm': 'bam',
3936 'bn': 'ben',
3937 'bo': 'bod',
3938 'br': 'bre',
3939 'bs': 'bos',
3940 'ca': 'cat',
3941 'ce': 'che',
3942 'ch': 'cha',
3943 'co': 'cos',
3944 'cr': 'cre',
3945 'cs': 'ces',
3946 'cu': 'chu',
3947 'cv': 'chv',
3948 'cy': 'cym',
3949 'da': 'dan',
3950 'de': 'deu',
3951 'dv': 'div',
3952 'dz': 'dzo',
3953 'ee': 'ewe',
3954 'el': 'ell',
3955 'en': 'eng',
3956 'eo': 'epo',
3957 'es': 'spa',
3958 'et': 'est',
3959 'eu': 'eus',
3960 'fa': 'fas',
3961 'ff': 'ful',
3962 'fi': 'fin',
3963 'fj': 'fij',
3964 'fo': 'fao',
3965 'fr': 'fra',
3966 'fy': 'fry',
3967 'ga': 'gle',
3968 'gd': 'gla',
3969 'gl': 'glg',
3970 'gn': 'grn',
3971 'gu': 'guj',
3972 'gv': 'glv',
3973 'ha': 'hau',
3974 'he': 'heb',
3975 'iw': 'heb', # Replaced by he in 1989 revision
3976 'hi': 'hin',
3977 'ho': 'hmo',
3978 'hr': 'hrv',
3979 'ht': 'hat',
3980 'hu': 'hun',
3981 'hy': 'hye',
3982 'hz': 'her',
3983 'ia': 'ina',
3984 'id': 'ind',
3985 'in': 'ind', # Replaced by id in 1989 revision
3986 'ie': 'ile',
3987 'ig': 'ibo',
3988 'ii': 'iii',
3989 'ik': 'ipk',
3990 'io': 'ido',
3991 'is': 'isl',
3992 'it': 'ita',
3993 'iu': 'iku',
3994 'ja': 'jpn',
3995 'jv': 'jav',
3996 'ka': 'kat',
3997 'kg': 'kon',
3998 'ki': 'kik',
3999 'kj': 'kua',
4000 'kk': 'kaz',
4001 'kl': 'kal',
4002 'km': 'khm',
4003 'kn': 'kan',
4004 'ko': 'kor',
4005 'kr': 'kau',
4006 'ks': 'kas',
4007 'ku': 'kur',
4008 'kv': 'kom',
4009 'kw': 'cor',
4010 'ky': 'kir',
4011 'la': 'lat',
4012 'lb': 'ltz',
4013 'lg': 'lug',
4014 'li': 'lim',
4015 'ln': 'lin',
4016 'lo': 'lao',
4017 'lt': 'lit',
4018 'lu': 'lub',
4019 'lv': 'lav',
4020 'mg': 'mlg',
4021 'mh': 'mah',
4022 'mi': 'mri',
4023 'mk': 'mkd',
4024 'ml': 'mal',
4025 'mn': 'mon',
4026 'mr': 'mar',
4027 'ms': 'msa',
4028 'mt': 'mlt',
4029 'my': 'mya',
4030 'na': 'nau',
4031 'nb': 'nob',
4032 'nd': 'nde',
4033 'ne': 'nep',
4034 'ng': 'ndo',
4035 'nl': 'nld',
4036 'nn': 'nno',
4037 'no': 'nor',
4038 'nr': 'nbl',
4039 'nv': 'nav',
4040 'ny': 'nya',
4041 'oc': 'oci',
4042 'oj': 'oji',
4043 'om': 'orm',
4044 'or': 'ori',
4045 'os': 'oss',
4046 'pa': 'pan',
4047 'pi': 'pli',
4048 'pl': 'pol',
4049 'ps': 'pus',
4050 'pt': 'por',
4051 'qu': 'que',
4052 'rm': 'roh',
4053 'rn': 'run',
4054 'ro': 'ron',
4055 'ru': 'rus',
4056 'rw': 'kin',
4057 'sa': 'san',
4058 'sc': 'srd',
4059 'sd': 'snd',
4060 'se': 'sme',
4061 'sg': 'sag',
4062 'si': 'sin',
4063 'sk': 'slk',
4064 'sl': 'slv',
4065 'sm': 'smo',
4066 'sn': 'sna',
4067 'so': 'som',
4068 'sq': 'sqi',
4069 'sr': 'srp',
4070 'ss': 'ssw',
4071 'st': 'sot',
4072 'su': 'sun',
4073 'sv': 'swe',
4074 'sw': 'swa',
4075 'ta': 'tam',
4076 'te': 'tel',
4077 'tg': 'tgk',
4078 'th': 'tha',
4079 'ti': 'tir',
4080 'tk': 'tuk',
4081 'tl': 'tgl',
4082 'tn': 'tsn',
4083 'to': 'ton',
4084 'tr': 'tur',
4085 'ts': 'tso',
4086 'tt': 'tat',
4087 'tw': 'twi',
4088 'ty': 'tah',
4089 'ug': 'uig',
4090 'uk': 'ukr',
4091 'ur': 'urd',
4092 'uz': 'uzb',
4093 've': 'ven',
4094 'vi': 'vie',
4095 'vo': 'vol',
4096 'wa': 'wln',
4097 'wo': 'wol',
4098 'xh': 'xho',
4099 'yi': 'yid',
4100 'ji': 'yid', # Replaced by yi in 1989 revision
4101 'yo': 'yor',
4102 'za': 'zha',
4103 'zh': 'zho',
4104 'zu': 'zul',
4105 }
4106
4107 @classmethod
4108 def short2long(cls, code):
4109 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4110 return cls._lang_map.get(code[:2])
4111
4112 @classmethod
4113 def long2short(cls, code):
4114 """Convert language code from ISO 639-2/T to ISO 639-1"""
4115 for short_name, long_name in cls._lang_map.items():
4116 if long_name == code:
4117 return short_name
4118
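# Usage sketch (illustrative):
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'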
4119
4120 class ISO3166Utils:
4121 # From http://data.okfn.org/data/core/country-list
4122 _country_map = {
4123 'AF': 'Afghanistan',
4124 'AX': 'Åland Islands',
4125 'AL': 'Albania',
4126 'DZ': 'Algeria',
4127 'AS': 'American Samoa',
4128 'AD': 'Andorra',
4129 'AO': 'Angola',
4130 'AI': 'Anguilla',
4131 'AQ': 'Antarctica',
4132 'AG': 'Antigua and Barbuda',
4133 'AR': 'Argentina',
4134 'AM': 'Armenia',
4135 'AW': 'Aruba',
4136 'AU': 'Australia',
4137 'AT': 'Austria',
4138 'AZ': 'Azerbaijan',
4139 'BS': 'Bahamas',
4140 'BH': 'Bahrain',
4141 'BD': 'Bangladesh',
4142 'BB': 'Barbados',
4143 'BY': 'Belarus',
4144 'BE': 'Belgium',
4145 'BZ': 'Belize',
4146 'BJ': 'Benin',
4147 'BM': 'Bermuda',
4148 'BT': 'Bhutan',
4149 'BO': 'Bolivia, Plurinational State of',
4150 'BQ': 'Bonaire, Sint Eustatius and Saba',
4151 'BA': 'Bosnia and Herzegovina',
4152 'BW': 'Botswana',
4153 'BV': 'Bouvet Island',
4154 'BR': 'Brazil',
4155 'IO': 'British Indian Ocean Territory',
4156 'BN': 'Brunei Darussalam',
4157 'BG': 'Bulgaria',
4158 'BF': 'Burkina Faso',
4159 'BI': 'Burundi',
4160 'KH': 'Cambodia',
4161 'CM': 'Cameroon',
4162 'CA': 'Canada',
4163 'CV': 'Cape Verde',
4164 'KY': 'Cayman Islands',
4165 'CF': 'Central African Republic',
4166 'TD': 'Chad',
4167 'CL': 'Chile',
4168 'CN': 'China',
4169 'CX': 'Christmas Island',
4170 'CC': 'Cocos (Keeling) Islands',
4171 'CO': 'Colombia',
4172 'KM': 'Comoros',
4173 'CG': 'Congo',
4174 'CD': 'Congo, the Democratic Republic of the',
4175 'CK': 'Cook Islands',
4176 'CR': 'Costa Rica',
4177 'CI': 'Côte d\'Ivoire',
4178 'HR': 'Croatia',
4179 'CU': 'Cuba',
4180 'CW': 'Curaçao',
4181 'CY': 'Cyprus',
4182 'CZ': 'Czech Republic',
4183 'DK': 'Denmark',
4184 'DJ': 'Djibouti',
4185 'DM': 'Dominica',
4186 'DO': 'Dominican Republic',
4187 'EC': 'Ecuador',
4188 'EG': 'Egypt',
4189 'SV': 'El Salvador',
4190 'GQ': 'Equatorial Guinea',
4191 'ER': 'Eritrea',
4192 'EE': 'Estonia',
4193 'ET': 'Ethiopia',
4194 'FK': 'Falkland Islands (Malvinas)',
4195 'FO': 'Faroe Islands',
4196 'FJ': 'Fiji',
4197 'FI': 'Finland',
4198 'FR': 'France',
4199 'GF': 'French Guiana',
4200 'PF': 'French Polynesia',
4201 'TF': 'French Southern Territories',
4202 'GA': 'Gabon',
4203 'GM': 'Gambia',
4204 'GE': 'Georgia',
4205 'DE': 'Germany',
4206 'GH': 'Ghana',
4207 'GI': 'Gibraltar',
4208 'GR': 'Greece',
4209 'GL': 'Greenland',
4210 'GD': 'Grenada',
4211 'GP': 'Guadeloupe',
4212 'GU': 'Guam',
4213 'GT': 'Guatemala',
4214 'GG': 'Guernsey',
4215 'GN': 'Guinea',
4216 'GW': 'Guinea-Bissau',
4217 'GY': 'Guyana',
4218 'HT': 'Haiti',
4219 'HM': 'Heard Island and McDonald Islands',
4220 'VA': 'Holy See (Vatican City State)',
4221 'HN': 'Honduras',
4222 'HK': 'Hong Kong',
4223 'HU': 'Hungary',
4224 'IS': 'Iceland',
4225 'IN': 'India',
4226 'ID': 'Indonesia',
4227 'IR': 'Iran, Islamic Republic of',
4228 'IQ': 'Iraq',
4229 'IE': 'Ireland',
4230 'IM': 'Isle of Man',
4231 'IL': 'Israel',
4232 'IT': 'Italy',
4233 'JM': 'Jamaica',
4234 'JP': 'Japan',
4235 'JE': 'Jersey',
4236 'JO': 'Jordan',
4237 'KZ': 'Kazakhstan',
4238 'KE': 'Kenya',
4239 'KI': 'Kiribati',
4240 'KP': 'Korea, Democratic People\'s Republic of',
4241 'KR': 'Korea, Republic of',
4242 'KW': 'Kuwait',
4243 'KG': 'Kyrgyzstan',
4244 'LA': 'Lao People\'s Democratic Republic',
4245 'LV': 'Latvia',
4246 'LB': 'Lebanon',
4247 'LS': 'Lesotho',
4248 'LR': 'Liberia',
4249 'LY': 'Libya',
4250 'LI': 'Liechtenstein',
4251 'LT': 'Lithuania',
4252 'LU': 'Luxembourg',
4253 'MO': 'Macao',
4254 'MK': 'Macedonia, the Former Yugoslav Republic of',
4255 'MG': 'Madagascar',
4256 'MW': 'Malawi',
4257 'MY': 'Malaysia',
4258 'MV': 'Maldives',
4259 'ML': 'Mali',
4260 'MT': 'Malta',
4261 'MH': 'Marshall Islands',
4262 'MQ': 'Martinique',
4263 'MR': 'Mauritania',
4264 'MU': 'Mauritius',
4265 'YT': 'Mayotte',
4266 'MX': 'Mexico',
4267 'FM': 'Micronesia, Federated States of',
4268 'MD': 'Moldova, Republic of',
4269 'MC': 'Monaco',
4270 'MN': 'Mongolia',
4271 'ME': 'Montenegro',
4272 'MS': 'Montserrat',
4273 'MA': 'Morocco',
4274 'MZ': 'Mozambique',
4275 'MM': 'Myanmar',
4276 'NA': 'Namibia',
4277 'NR': 'Nauru',
4278 'NP': 'Nepal',
4279 'NL': 'Netherlands',
4280 'NC': 'New Caledonia',
4281 'NZ': 'New Zealand',
4282 'NI': 'Nicaragua',
4283 'NE': 'Niger',
4284 'NG': 'Nigeria',
4285 'NU': 'Niue',
4286 'NF': 'Norfolk Island',
4287 'MP': 'Northern Mariana Islands',
4288 'NO': 'Norway',
4289 'OM': 'Oman',
4290 'PK': 'Pakistan',
4291 'PW': 'Palau',
4292 'PS': 'Palestine, State of',
4293 'PA': 'Panama',
4294 'PG': 'Papua New Guinea',
4295 'PY': 'Paraguay',
4296 'PE': 'Peru',
4297 'PH': 'Philippines',
4298 'PN': 'Pitcairn',
4299 'PL': 'Poland',
4300 'PT': 'Portugal',
4301 'PR': 'Puerto Rico',
4302 'QA': 'Qatar',
4303 'RE': 'Réunion',
4304 'RO': 'Romania',
4305 'RU': 'Russian Federation',
4306 'RW': 'Rwanda',
4307 'BL': 'Saint Barthélemy',
4308 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4309 'KN': 'Saint Kitts and Nevis',
4310 'LC': 'Saint Lucia',
4311 'MF': 'Saint Martin (French part)',
4312 'PM': 'Saint Pierre and Miquelon',
4313 'VC': 'Saint Vincent and the Grenadines',
4314 'WS': 'Samoa',
4315 'SM': 'San Marino',
4316 'ST': 'Sao Tome and Principe',
4317 'SA': 'Saudi Arabia',
4318 'SN': 'Senegal',
4319 'RS': 'Serbia',
4320 'SC': 'Seychelles',
4321 'SL': 'Sierra Leone',
4322 'SG': 'Singapore',
4323 'SX': 'Sint Maarten (Dutch part)',
4324 'SK': 'Slovakia',
4325 'SI': 'Slovenia',
4326 'SB': 'Solomon Islands',
4327 'SO': 'Somalia',
4328 'ZA': 'South Africa',
4329 'GS': 'South Georgia and the South Sandwich Islands',
4330 'SS': 'South Sudan',
4331 'ES': 'Spain',
4332 'LK': 'Sri Lanka',
4333 'SD': 'Sudan',
4334 'SR': 'Suriname',
4335 'SJ': 'Svalbard and Jan Mayen',
4336 'SZ': 'Swaziland',
4337 'SE': 'Sweden',
4338 'CH': 'Switzerland',
4339 'SY': 'Syrian Arab Republic',
4340 'TW': 'Taiwan, Province of China',
4341 'TJ': 'Tajikistan',
4342 'TZ': 'Tanzania, United Republic of',
4343 'TH': 'Thailand',
4344 'TL': 'Timor-Leste',
4345 'TG': 'Togo',
4346 'TK': 'Tokelau',
4347 'TO': 'Tonga',
4348 'TT': 'Trinidad and Tobago',
4349 'TN': 'Tunisia',
4350 'TR': 'Turkey',
4351 'TM': 'Turkmenistan',
4352 'TC': 'Turks and Caicos Islands',
4353 'TV': 'Tuvalu',
4354 'UG': 'Uganda',
4355 'UA': 'Ukraine',
4356 'AE': 'United Arab Emirates',
4357 'GB': 'United Kingdom',
4358 'US': 'United States',
4359 'UM': 'United States Minor Outlying Islands',
4360 'UY': 'Uruguay',
4361 'UZ': 'Uzbekistan',
4362 'VU': 'Vanuatu',
4363 'VE': 'Venezuela, Bolivarian Republic of',
4364 'VN': 'Viet Nam',
4365 'VG': 'Virgin Islands, British',
4366 'VI': 'Virgin Islands, U.S.',
4367 'WF': 'Wallis and Futuna',
4368 'EH': 'Western Sahara',
4369 'YE': 'Yemen',
4370 'ZM': 'Zambia',
4371 'ZW': 'Zimbabwe',
4372 # Not ISO 3166 codes, but used for IP blocks
4373 'AP': 'Asia/Pacific Region',
4374 'EU': 'Europe',
4375 }
4376
4377 @classmethod
4378 def short2full(cls, code):
4379 """Convert an ISO 3166-2 country code to the corresponding full name"""
4380 return cls._country_map.get(code.upper())
4381
4382
4383 class GeoUtils:
4384 # Major IPv4 address blocks per country
4385 _country_ip_map = {
4386 'AD': '46.172.224.0/19',
4387 'AE': '94.200.0.0/13',
4388 'AF': '149.54.0.0/17',
4389 'AG': '209.59.64.0/18',
4390 'AI': '204.14.248.0/21',
4391 'AL': '46.99.0.0/16',
4392 'AM': '46.70.0.0/15',
4393 'AO': '105.168.0.0/13',
4394 'AP': '182.50.184.0/21',
4395 'AQ': '23.154.160.0/24',
4396 'AR': '181.0.0.0/12',
4397 'AS': '202.70.112.0/20',
4398 'AT': '77.116.0.0/14',
4399 'AU': '1.128.0.0/11',
4400 'AW': '181.41.0.0/18',
4401 'AX': '185.217.4.0/22',
4402 'AZ': '5.197.0.0/16',
4403 'BA': '31.176.128.0/17',
4404 'BB': '65.48.128.0/17',
4405 'BD': '114.130.0.0/16',
4406 'BE': '57.0.0.0/8',
4407 'BF': '102.178.0.0/15',
4408 'BG': '95.42.0.0/15',
4409 'BH': '37.131.0.0/17',
4410 'BI': '154.117.192.0/18',
4411 'BJ': '137.255.0.0/16',
4412 'BL': '185.212.72.0/23',
4413 'BM': '196.12.64.0/18',
4414 'BN': '156.31.0.0/16',
4415 'BO': '161.56.0.0/16',
4416 'BQ': '161.0.80.0/20',
4417 'BR': '191.128.0.0/12',
4418 'BS': '24.51.64.0/18',
4419 'BT': '119.2.96.0/19',
4420 'BW': '168.167.0.0/16',
4421 'BY': '178.120.0.0/13',
4422 'BZ': '179.42.192.0/18',
4423 'CA': '99.224.0.0/11',
4424 'CD': '41.243.0.0/16',
4425 'CF': '197.242.176.0/21',
4426 'CG': '160.113.0.0/16',
4427 'CH': '85.0.0.0/13',
4428 'CI': '102.136.0.0/14',
4429 'CK': '202.65.32.0/19',
4430 'CL': '152.172.0.0/14',
4431 'CM': '102.244.0.0/14',
4432 'CN': '36.128.0.0/10',
4433 'CO': '181.240.0.0/12',
4434 'CR': '201.192.0.0/12',
4435 'CU': '152.206.0.0/15',
4436 'CV': '165.90.96.0/19',
4437 'CW': '190.88.128.0/17',
4438 'CY': '31.153.0.0/16',
4439 'CZ': '88.100.0.0/14',
4440 'DE': '53.0.0.0/8',
4441 'DJ': '197.241.0.0/17',
4442 'DK': '87.48.0.0/12',
4443 'DM': '192.243.48.0/20',
4444 'DO': '152.166.0.0/15',
4445 'DZ': '41.96.0.0/12',
4446 'EC': '186.68.0.0/15',
4447 'EE': '90.190.0.0/15',
4448 'EG': '156.160.0.0/11',
4449 'ER': '196.200.96.0/20',
4450 'ES': '88.0.0.0/11',
4451 'ET': '196.188.0.0/14',
4452 'EU': '2.16.0.0/13',
4453 'FI': '91.152.0.0/13',
4454 'FJ': '144.120.0.0/16',
4455 'FK': '80.73.208.0/21',
4456 'FM': '119.252.112.0/20',
4457 'FO': '88.85.32.0/19',
4458 'FR': '90.0.0.0/9',
4459 'GA': '41.158.0.0/15',
4460 'GB': '25.0.0.0/8',
4461 'GD': '74.122.88.0/21',
4462 'GE': '31.146.0.0/16',
4463 'GF': '161.22.64.0/18',
4464 'GG': '62.68.160.0/19',
4465 'GH': '154.160.0.0/12',
4466 'GI': '95.164.0.0/16',
4467 'GL': '88.83.0.0/19',
4468 'GM': '160.182.0.0/15',
4469 'GN': '197.149.192.0/18',
4470 'GP': '104.250.0.0/19',
4471 'GQ': '105.235.224.0/20',
4472 'GR': '94.64.0.0/13',
4473 'GT': '168.234.0.0/16',
4474 'GU': '168.123.0.0/16',
4475 'GW': '197.214.80.0/20',
4476 'GY': '181.41.64.0/18',
4477 'HK': '113.252.0.0/14',
4478 'HN': '181.210.0.0/16',
4479 'HR': '93.136.0.0/13',
4480 'HT': '148.102.128.0/17',
4481 'HU': '84.0.0.0/14',
4482 'ID': '39.192.0.0/10',
4483 'IE': '87.32.0.0/12',
4484 'IL': '79.176.0.0/13',
4485 'IM': '5.62.80.0/20',
4486 'IN': '117.192.0.0/10',
4487 'IO': '203.83.48.0/21',
4488 'IQ': '37.236.0.0/14',
4489 'IR': '2.176.0.0/12',
4490 'IS': '82.221.0.0/16',
4491 'IT': '79.0.0.0/10',
4492 'JE': '87.244.64.0/18',
4493 'JM': '72.27.0.0/17',
4494 'JO': '176.29.0.0/16',
4495 'JP': '133.0.0.0/8',
4496 'KE': '105.48.0.0/12',
4497 'KG': '158.181.128.0/17',
4498 'KH': '36.37.128.0/17',
4499 'KI': '103.25.140.0/22',
4500 'KM': '197.255.224.0/20',
4501 'KN': '198.167.192.0/19',
4502 'KP': '175.45.176.0/22',
4503 'KR': '175.192.0.0/10',
4504 'KW': '37.36.0.0/14',
4505 'KY': '64.96.0.0/15',
4506 'KZ': '2.72.0.0/13',
4507 'LA': '115.84.64.0/18',
4508 'LB': '178.135.0.0/16',
4509 'LC': '24.92.144.0/20',
4510 'LI': '82.117.0.0/19',
4511 'LK': '112.134.0.0/15',
4512 'LR': '102.183.0.0/16',
4513 'LS': '129.232.0.0/17',
4514 'LT': '78.56.0.0/13',
4515 'LU': '188.42.0.0/16',
4516 'LV': '46.109.0.0/16',
4517 'LY': '41.252.0.0/14',
4518 'MA': '105.128.0.0/11',
4519 'MC': '88.209.64.0/18',
4520 'MD': '37.246.0.0/16',
4521 'ME': '178.175.0.0/17',
4522 'MF': '74.112.232.0/21',
4523 'MG': '154.126.0.0/17',
4524 'MH': '117.103.88.0/21',
4525 'MK': '77.28.0.0/15',
4526 'ML': '154.118.128.0/18',
4527 'MM': '37.111.0.0/17',
4528 'MN': '49.0.128.0/17',
4529 'MO': '60.246.0.0/16',
4530 'MP': '202.88.64.0/20',
4531 'MQ': '109.203.224.0/19',
4532 'MR': '41.188.64.0/18',
4533 'MS': '208.90.112.0/22',
4534 'MT': '46.11.0.0/16',
4535 'MU': '105.16.0.0/12',
4536 'MV': '27.114.128.0/18',
4537 'MW': '102.70.0.0/15',
4538 'MX': '187.192.0.0/11',
4539 'MY': '175.136.0.0/13',
4540 'MZ': '197.218.0.0/15',
4541 'NA': '41.182.0.0/16',
4542 'NC': '101.101.0.0/18',
4543 'NE': '197.214.0.0/18',
4544 'NF': '203.17.240.0/22',
4545 'NG': '105.112.0.0/12',
4546 'NI': '186.76.0.0/15',
4547 'NL': '145.96.0.0/11',
4548 'NO': '84.208.0.0/13',
4549 'NP': '36.252.0.0/15',
4550 'NR': '203.98.224.0/19',
4551 'NU': '49.156.48.0/22',
4552 'NZ': '49.224.0.0/14',
4553 'OM': '5.36.0.0/15',
4554 'PA': '186.72.0.0/15',
4555 'PE': '186.160.0.0/14',
4556 'PF': '123.50.64.0/18',
4557 'PG': '124.240.192.0/19',
4558 'PH': '49.144.0.0/13',
4559 'PK': '39.32.0.0/11',
4560 'PL': '83.0.0.0/11',
4561 'PM': '70.36.0.0/20',
4562 'PR': '66.50.0.0/16',
4563 'PS': '188.161.0.0/16',
4564 'PT': '85.240.0.0/13',
4565 'PW': '202.124.224.0/20',
4566 'PY': '181.120.0.0/14',
4567 'QA': '37.210.0.0/15',
4568 'RE': '102.35.0.0/16',
4569 'RO': '79.112.0.0/13',
4570 'RS': '93.86.0.0/15',
4571 'RU': '5.136.0.0/13',
4572 'RW': '41.186.0.0/16',
4573 'SA': '188.48.0.0/13',
4574 'SB': '202.1.160.0/19',
4575 'SC': '154.192.0.0/11',
4576 'SD': '102.120.0.0/13',
4577 'SE': '78.64.0.0/12',
4578 'SG': '8.128.0.0/10',
4579 'SI': '188.196.0.0/14',
4580 'SK': '78.98.0.0/15',
4581 'SL': '102.143.0.0/17',
4582 'SM': '89.186.32.0/19',
4583 'SN': '41.82.0.0/15',
4584 'SO': '154.115.192.0/18',
4585 'SR': '186.179.128.0/17',
4586 'SS': '105.235.208.0/21',
4587 'ST': '197.159.160.0/19',
4588 'SV': '168.243.0.0/16',
4589 'SX': '190.102.0.0/20',
4590 'SY': '5.0.0.0/16',
4591 'SZ': '41.84.224.0/19',
4592 'TC': '65.255.48.0/20',
4593 'TD': '154.68.128.0/19',
4594 'TG': '196.168.0.0/14',
4595 'TH': '171.96.0.0/13',
4596 'TJ': '85.9.128.0/18',
4597 'TK': '27.96.24.0/21',
4598 'TL': '180.189.160.0/20',
4599 'TM': '95.85.96.0/19',
4600 'TN': '197.0.0.0/11',
4601 'TO': '175.176.144.0/21',
4602 'TR': '78.160.0.0/11',
4603 'TT': '186.44.0.0/15',
4604 'TV': '202.2.96.0/19',
4605 'TW': '120.96.0.0/11',
4606 'TZ': '156.156.0.0/14',
4607 'UA': '37.52.0.0/14',
4608 'UG': '102.80.0.0/13',
4609 'US': '6.0.0.0/8',
4610 'UY': '167.56.0.0/13',
4611 'UZ': '84.54.64.0/18',
4612 'VA': '212.77.0.0/19',
4613 'VC': '207.191.240.0/21',
4614 'VE': '186.88.0.0/13',
4615 'VG': '66.81.192.0/20',
4616 'VI': '146.226.0.0/16',
4617 'VN': '14.160.0.0/11',
4618 'VU': '202.80.32.0/20',
4619 'WF': '117.20.32.0/21',
4620 'WS': '202.4.32.0/19',
4621 'YE': '134.35.0.0/16',
4622 'YT': '41.242.116.0/22',
4623 'ZA': '41.0.0.0/11',
4624 'ZM': '102.144.0.0/13',
4625 'ZW': '102.177.192.0/18',
4626 }
4627
4628 @classmethod
4629 def random_ipv4(cls, code_or_block):
4630 if len(code_or_block) == 2:
4631 block = cls._country_ip_map.get(code_or_block.upper())
4632 if not block:
4633 return None
4634 else:
4635 block = code_or_block
4636 addr, preflen = block.split('/')
4637 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4638 addr_max = addr_min | (0xffffffff >> int(preflen))
4639 return compat_str(socket.inet_ntoa(
4640 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4641
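# Usage sketch (illustrative): the address is drawn uniformly from the block,
# so network/broadcast addresses within the range may also be returned.
#   >>> GeoUtils.random_ipv4('US')          # some address inside 6.0.0.0/8
#   >>> GeoUtils.random_ipv4('1.2.3.0/24')  # explicit CIDR blocks also work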
4642
4643 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4644 def __init__(self, proxies=None):
4645 # Set default handlers
4646 for type in ('http', 'https'):
4647 setattr(self, '%s_open' % type,
4648 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4649 meth(r, proxy, type))
4650 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4651
4652 def proxy_open(self, req, proxy, type):
4653 req_proxy = req.headers.get('Ytdl-request-proxy')
4654 if req_proxy is not None:
4655 proxy = req_proxy
4656 del req.headers['Ytdl-request-proxy']
4657
4658 if proxy == '__noproxy__':
4659 return None # No Proxy
4660 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4661 req.add_header('Ytdl-socks-proxy', proxy)
4662 # Wrapping the socket with socks is handled by yt-dlp's http/https handlers
4663 return None
4664 return compat_urllib_request.ProxyHandler.proxy_open(
4665 self, req, proxy, type)
4666
4667
4668 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4669 # released into Public Domain
4670 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4671
4672 def long_to_bytes(n, blocksize=0):
4673 """long_to_bytes(n:long, blocksize:int) : string
4674 Convert a long integer to a byte string.
4675
4676 If optional blocksize is given and greater than zero, pad the front of the
4677 byte string with binary zeros so that the length is a multiple of
4678 blocksize.
4679 """
4680 # after much testing, this algorithm was deemed to be the fastest
4681 s = b''
4682 n = int(n)
4683 while n > 0:
4684 s = compat_struct_pack('>I', n & 0xffffffff) + s
4685 n = n >> 32
4686 # strip off leading zeros
4687 for i in range(len(s)):
4688 if s[i] != b'\000'[0]:
4689 break
4690 else:
4691 # only happens when n == 0
4692 s = b'\000'
4693 i = 0
4694 s = s[i:]
4695 # add back some pad bytes. this could be done more efficiently w.r.t. the
4696 # de-padding being done above, but sigh...
4697 if blocksize > 0 and len(s) % blocksize:
4698 s = (blocksize - len(s) % blocksize) * b'\000' + s
4699 return s
4700
4701
4702 def bytes_to_long(s):
4703 """bytes_to_long(string) : long
4704 Convert a byte string to a long integer.
4705
4706 This is (essentially) the inverse of long_to_bytes().
4707 """
4708 acc = 0
4709 length = len(s)
4710 if length % 4:
4711 extra = (4 - length % 4)
4712 s = b'\000' * extra + s
4713 length = length + extra
4714 for i in range(0, length, 4):
4715 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4716 return acc
4717
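# Usage sketch (illustrative): the two helpers are inverses of each other.
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> long_to_bytes(65537, blocksize=4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537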
4718
4719 def ohdave_rsa_encrypt(data, exponent, modulus):
4720 '''
4721 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4722
4723 Input:
4724 data: data to encrypt, bytes-like object
4725 exponent, modulus: parameter e and N of RSA algorithm, both integer
4726 Output: hex string of encrypted data
4727
4728 Limitation: supports one block encryption only
4729 '''
4730
4731 payload = int(binascii.hexlify(data[::-1]), 16)
4732 encrypted = pow(payload, exponent, modulus)
4733 return '%x' % encrypted
4734
4735
4736 def pkcs1pad(data, length):
4737 """
4738 Padding input data with PKCS#1 scheme
4739
4740 @param {int[]} data input data
4741 @param {int} length target length
4742 @returns {int[]} padded data
4743 """
4744 if len(data) > length - 11:
4745 raise ValueError('Input data too long for PKCS#1 padding')
4746
4747 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 padding bytes must be nonzero
4748 return [0, 2] + pseudo_random + [0] + data
4749
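# Usage sketch (illustrative): the layout is [0, 2, <nonzero padding>, 0, <data>].
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> len(padded), padded[:2], padded[-4:]
#   (16, [0, 2], [0, 1, 2, 3])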
4750
4751 def _base_n_table(n, table):
4752 if not table and not n:
4753 raise ValueError('Either table or n must be specified')
4754 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4755
4756 if n is not None and n != len(table):
4757 raise ValueError(f'base {n} exceeds table length {len(table)}')
4758 return table
4759
4760
4761 def encode_base_n(num, n=None, table=None):
4762 """Convert given int to a base-n string"""
4763 table = _base_n_table(n, table)
4764 if not num:
4765 return table[0]
4766
4767 result, base = '', len(table)
4768 while num:
4769 result = table[num % base] + result
4770 num = num // base
4771 return result
4772
4773
4774 def decode_base_n(string, n=None, table=None):
4775 """Convert given base-n string to int"""
4776 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4777 result, base = 0, len(table)
4778 for char in string:
4779 result = result * base + table[char]
4780 return result
4781
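# Usage sketch (illustrative):
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(5, table='01')  # n may be omitted when a table is given
#   '101'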
4782
4783 def decode_base(value, digits):
4784 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4785 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4786 return decode_base_n(value, table=digits)
4787
4788
4789 def decode_packed_codes(code):
4790 mobj = re.search(PACKED_CODES_RE, code)
4791 obfuscated_code, base, count, symbols = mobj.groups()
4792 base = int(base)
4793 count = int(count)
4794 symbols = symbols.split('|')
4795 symbol_table = {}
4796
4797 while count:
4798 count -= 1
4799 base_n_count = encode_base_n(count, base)
4800 symbol_table[base_n_count] = symbols[count] or base_n_count
4801
4802 return re.sub(
4803 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4804 obfuscated_code)
4805
4806
4807 def caesar(s, alphabet, shift):
4808 if shift == 0:
4809 return s
4810 l = len(alphabet)
4811 return ''.join(
4812 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4813 for c in s)
4814
4815
4816 def rot47(s):
4817 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4818
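# Usage sketch (illustrative): rot47 shifts by half of the 94-character
# printable-ASCII alphabet, so applying it twice is a no-op.
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 2)
#   'cde'
#   >>> rot47(rot47('yt-dlp')) == 'yt-dlp'
#   True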
4819
4820 def parse_m3u8_attributes(attrib):
4821 info = {}
4822 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4823 if val.startswith('"'):
4824 val = val[1:-1]
4825 info[key] = val
4826 return info
4827
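# Usage sketch (illustrative): quoted values may contain commas.
#   >>> parse_m3u8_attributes('BANDWIDTH=800000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '800000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}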
4828
4829 def urshift(val, n):
4830 return val >> n if val >= 0 else (val + 0x100000000) >> n
4831
4832
4833 # Based on png2str() written by @gdkchan and improved by @yokrysty
4834 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4835 def decode_png(png_data):
4836 # Reference: https://www.w3.org/TR/PNG/
4837 header = png_data[8:]
4838
4839 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4840 raise OSError('Not a valid PNG file.')
4841
4842 int_map = {1: '>B', 2: '>H', 4: '>I'}
4843 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4844
4845 chunks = []
4846
4847 while header:
4848 length = unpack_integer(header[:4])
4849 header = header[4:]
4850
4851 chunk_type = header[:4]
4852 header = header[4:]
4853
4854 chunk_data = header[:length]
4855 header = header[length:]
4856
4857 header = header[4:] # Skip CRC
4858
4859 chunks.append({
4860 'type': chunk_type,
4861 'length': length,
4862 'data': chunk_data
4863 })
4864
4865 ihdr = chunks[0]['data']
4866
4867 width = unpack_integer(ihdr[:4])
4868 height = unpack_integer(ihdr[4:8])
4869
4870 idat = b''
4871
4872 for chunk in chunks:
4873 if chunk['type'] == b'IDAT':
4874 idat += chunk['data']
4875
4876 if not idat:
4877 raise OSError('Unable to read PNG data.')
4878
4879 decompressed_data = bytearray(zlib.decompress(idat))
4880
4881 stride = width * 3
4882 pixels = []
4883
4884 def _get_pixel(idx):
4885 x = idx % stride
4886 y = idx // stride
4887 return pixels[y][x]
4888
4889 for y in range(height):
4890 basePos = y * (1 + stride)
4891 filter_type = decompressed_data[basePos]
4892
4893 current_row = []
4894
4895 pixels.append(current_row)
4896
4897 for x in range(stride):
4898 color = decompressed_data[1 + basePos + x]
4899 basex = y * stride + x
4900 left = 0
4901 up = 0
4902
4903 if x > 2:
4904 left = _get_pixel(basex - 3)
4905 if y > 0:
4906 up = _get_pixel(basex - stride)
4907
4908 if filter_type == 1: # Sub
4909 color = (color + left) & 0xff
4910 elif filter_type == 2: # Up
4911 color = (color + up) & 0xff
4912 elif filter_type == 3: # Average
4913 color = (color + ((left + up) >> 1)) & 0xff
4914 elif filter_type == 4: # Paeth
4915 a = left
4916 b = up
4917 c = 0
4918
4919 if x > 2 and y > 0:
4920 c = _get_pixel(basex - stride - 3)
4921
4922 p = a + b - c
4923
4924 pa = abs(p - a)
4925 pb = abs(p - b)
4926 pc = abs(p - c)
4927
4928 if pa <= pb and pa <= pc:
4929 color = (color + a) & 0xff
4930 elif pb <= pc:
4931 color = (color + b) & 0xff
4932 else:
4933 color = (color + c) & 0xff
4934
4935 current_row.append(color)
4936
4937 return width, height, pixels
4938
4939
4940 def write_xattr(path, key, value):
4941 # Windows: Write xattrs to NTFS Alternate Data Streams:
4942 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4943 if compat_os_name == 'nt':
4944 assert ':' not in key
4945 assert os.path.exists(path)
4946
4947 try:
4948 with open(f'{path}:{key}', 'wb') as f:
4949 f.write(value)
4950 except OSError as e:
4951 raise XAttrMetadataError(e.errno, e.strerror)
4952 return
4953
4954 # UNIX Method 1. Use xattrs/pyxattrs modules
4955 from .dependencies import xattr
4956
4957 setxattr = None
4958 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4959 # Unicode arguments are not supported in pyxattr until version 0.5.0
4960 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4961 if version_tuple(xattr.__version__) >= (0, 5, 0):
4962 setxattr = xattr.set
4963 elif xattr:
4964 setxattr = xattr.setxattr
4965
4966 if setxattr:
4967 try:
4968 setxattr(path, key, value)
4969 except OSError as e:
4970 raise XAttrMetadataError(e.errno, e.strerror)
4971 return
4972
4973 # UNIX Method 2. Use setfattr/xattr executables
4974 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4975 else 'xattr' if check_executable('xattr', ['-h']) else None)
4976 if not exe:
4977 raise XAttrUnavailableError(
4978 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4979 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4980
4981 value = value.decode()
4982 try:
4983 _, stderr, returncode = Popen.run(
4984 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4985 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4986 except OSError as e:
4987 raise XAttrMetadataError(e.errno, e.strerror)
4988 if returncode:
4989 raise XAttrMetadataError(returncode, stderr)
4990
4991
4992 def random_birthday(year_field, month_field, day_field):
4993 start_date = datetime.date(1950, 1, 1)
4994 end_date = datetime.date(1995, 12, 31)
4995 offset = random.randint(0, (end_date - start_date).days)
4996 random_date = start_date + datetime.timedelta(offset)
4997 return {
4998 year_field: str(random_date.year),
4999 month_field: str(random_date.month),
5000 day_field: str(random_date.day),
5001 }
5002
5003
5004 # Templates for internet shortcut files, which are plain text files.
5005 DOT_URL_LINK_TEMPLATE = '''\
5006 [InternetShortcut]
5007 URL=%(url)s
5008 '''
5009
5010 DOT_WEBLOC_LINK_TEMPLATE = '''\
5011 <?xml version="1.0" encoding="UTF-8"?>
5012 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5013 <plist version="1.0">
5014 <dict>
5015 \t<key>URL</key>
5016 \t<string>%(url)s</string>
5017 </dict>
5018 </plist>
5019 '''
5020
5021 DOT_DESKTOP_LINK_TEMPLATE = '''\
5022 [Desktop Entry]
5023 Encoding=UTF-8
5024 Name=%(filename)s
5025 Type=Link
5026 URL=%(url)s
5027 Icon=text-html
5028 '''
5029
5030 LINK_TEMPLATES = {
5031 'url': DOT_URL_LINK_TEMPLATE,
5032 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5033 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5034 }
5035
5036
5037 def iri_to_uri(iri):
5038 """
5039 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5040
5041 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes every character that is not already escaped, using UTF-8 as the underlying encoding, and leaves existing escape sequences intact.
5042 """
5043
5044 iri_parts = compat_urllib_parse_urlparse(iri)
5045
5046 if '[' in iri_parts.netloc:
5047 raise ValueError('IPv6 URIs are not yet supported.')
5048 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5049
5050 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5051
5052 net_location = ''
5053 if iri_parts.username:
5054 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5055 if iri_parts.password is not None:
5056 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5057 net_location += '@'
5058
5059 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5060 # The 'idna' encoding produces ASCII text.
5061 if iri_parts.port is not None and iri_parts.port != 80:
5062 net_location += ':' + str(iri_parts.port)
5063
5064 return urllib.parse.urlunparse(
5065 (iri_parts.scheme,
5066 net_location,
5067
5068 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5069
5070 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5071 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5072
5073 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5074 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5075
5076 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5077
5078 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5079
5080
5081 def to_high_limit_path(path):
5082 if sys.platform in ['win32', 'cygwin']:
5083 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5084 return '\\\\?\\' + os.path.abspath(path)
5085
5086 return path
5087
5088
5089 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5090 val = traverse_obj(obj, *variadic(field))
5091 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5092 return default
5093 return template % func(val)
5094
5095
5096 def clean_podcast_url(url):
5097 return re.sub(r'''(?x)
5098 (?:
5099 (?:
5100 chtbl\.com/track|
5101 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5102 play\.podtrac\.com
5103 )/[^/]+|
5104 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5105 flex\.acast\.com|
5106 pd(?:
5107 cn\.co| # https://podcorn.com/analytics-prefix/
5108 st\.fm # https://podsights.com/docs/
5109 )/e
5110 )/''', '', url)
5111
5112
5113 _HEX_TABLE = '0123456789abcdef'
5114
5115
5116 def random_uuidv4():
5117 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5118
5119
5120 def make_dir(path, to_screen=None):
5121 try:
5122 dn = os.path.dirname(path)
5123 if dn and not os.path.exists(dn):
5124 os.makedirs(dn)
5125 return True
5126 except OSError as err:
5127 if callable(to_screen):
5128 to_screen('unable to create directory ' + error_to_compat_str(err))
5129 return False
5130
5131
5132 def get_executable_path():
5133 from .update import _get_variant_and_executable_path
5134
5135 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5136
5137
5138 def load_plugins(name, suffix, namespace):
5139 classes = {}
5140 with contextlib.suppress(FileNotFoundError):
5141 plugins_spec = importlib.util.spec_from_file_location(
5142 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5143 plugins = importlib.util.module_from_spec(plugins_spec)
5144 sys.modules[plugins_spec.name] = plugins
5145 plugins_spec.loader.exec_module(plugins)
5146 for name in dir(plugins):
5147 if name in namespace:
5148 continue
5149 if not name.endswith(suffix):
5150 continue
5151 klass = getattr(plugins, name)
5152 classes[name] = namespace[name] = klass
5153 return classes
5154
5155
5156 def traverse_obj(
5157 obj, *path_list, default=None, expected_type=None, get_all=True,
5158 casesense=True, is_user_input=False, traverse_string=False):
5159 ''' Traverse nested list/dict/tuple
5160 @param path_list A list of paths which are checked one by one.
5161 Each path is a list of keys where each key is a:
5162 - None: Do nothing
5163 - string: A dictionary key
5164 - int: An index into a list
5165 - tuple: A list of keys all of which will be traversed
5166 - Ellipsis: Fetch all values in the object
5167 - Function: Takes the key and value as arguments
5168 and returns whether the key matches or not
5169 @param default Default value to return
5170 @param expected_type Only accept final value of this type (Can also be any callable)
5171 @param get_all Return all the values obtained from a path or only the first one
5172 @param casesense Whether to consider dictionary keys as case sensitive
5173 @param is_user_input Whether the keys are generated from user input. If True,
5174 strings are converted to int/slice if necessary
5175 @param traverse_string Whether to traverse inside strings. If True, any
5176 non-compatible object will also be converted into a string
5177 # TODO: Write tests
5178 '''
5179 if not casesense:
5180 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5181 path_list = (map(_lower, variadic(path)) for path in path_list)
5182
5183 def _traverse_obj(obj, path, _current_depth=0):
5184 nonlocal depth
5185 path = tuple(variadic(path))
5186 for i, key in enumerate(path):
5187 if None in (key, obj):
5188 return obj
5189 if isinstance(key, (list, tuple)):
5190 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5191 key = ...
5192 if key is ...:
5193 obj = (obj.values() if isinstance(obj, dict)
5194 else obj if isinstance(obj, (list, tuple, LazyList))
5195 else str(obj) if traverse_string else [])
5196 _current_depth += 1
5197 depth = max(depth, _current_depth)
5198 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5199 elif callable(key):
5200 if isinstance(obj, (list, tuple, LazyList)):
5201 obj = enumerate(obj)
5202 elif isinstance(obj, dict):
5203 obj = obj.items()
5204 else:
5205 if not traverse_string:
5206 return None
5207 obj = str(obj)
5208 _current_depth += 1
5209 depth = max(depth, _current_depth)
5210 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5211 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5212 obj = (obj.get(key) if casesense or (key in obj)
5213 else next((v for k, v in obj.items() if _lower(k) == key), None))
5214 else:
5215 if is_user_input:
5216 key = (int_or_none(key) if ':' not in key
5217 else slice(*map(int_or_none, key.split(':'))))
5218 if key == slice(None):
5219 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5220 if not isinstance(key, (int, slice)):
5221 return None
5222 if not isinstance(obj, (list, tuple, LazyList)):
5223 if not traverse_string:
5224 return None
5225 obj = str(obj)
5226 try:
5227 obj = obj[key]
5228 except IndexError:
5229 return None
5230 return obj
5231
5232 if isinstance(expected_type, type):
5233 type_test = lambda val: val if isinstance(val, expected_type) else None
5234 else:
5235 type_test = expected_type or IDENTITY
5236
5237 for path in path_list:
5238 depth = 0
5239 val = _traverse_obj(obj, path)
5240 if val is not None:
5241 if depth:
5242 for _ in range(depth - 1):
5243 val = itertools.chain.from_iterable(v for v in val if v is not None)
5244 val = [v for v in map(type_test, val) if v is not None]
5245 if val:
5246 return val if get_all else val[0]
5247 else:
5248 val = type_test(val)
5249 if val is not None:
5250 return val
5251 return default
5252
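# Usage sketch (illustrative):
#   >>> obj = {'formats': [{'url': 'https://a'}, {'height': 720, 'url': 'https://b'}]}
#   >>> traverse_obj(obj, ('formats', 0, 'url'))
#   'https://a'
#   >>> traverse_obj(obj, ('formats', ..., 'height'))  # `...` fetches all values
#   [720]
#   >>> traverse_obj(obj, ('formats', 1, 'width'), default=0)
#   0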
5253
5254 def traverse_dict(dictn, keys, casesense=True):
5255 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5256 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5257 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5258
5259
5260 def get_first(obj, keys, **kwargs):
5261 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5262
5263
5264 def variadic(x, allowed_types=(str, bytes, dict)):
5265 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5266
5267
5268 def time_seconds(**kwargs):
5269 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5270 return t.timestamp()
5271
5272
5273 # create a JSON Web Signature (jws) with HS256 algorithm
5274 # the resulting format is in JWS Compact Serialization
5275 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5276 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5277 def jwt_encode_hs256(payload_data, key, headers={}):
5278 header_data = {
5279 'alg': 'HS256',
5280 'typ': 'JWT',
5281 }
5282 if headers:
5283 header_data.update(headers)
5284 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5285 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5286 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5287 signature_b64 = base64.b64encode(h.digest())
5288 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5289 return token
5290
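# Usage sketch (illustrative): note that, unlike strict RFC 7515, the segments
# here use standard (padded) base64 rather than unpadded base64url.
#   >>> token = jwt_encode_hs256({'sub': 'user'}, 'secret')
#   >>> token.count(b'.')
#   2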
5291
5292 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5293 def jwt_decode_hs256(jwt):
5294 header_b64, payload_b64, signature_b64 = jwt.split('.')
5295 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64 + '=' * (-len(payload_b64) % 4)))  # re-add stripped base64 padding
5296 return payload_data
5297
5298
5299 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5300
5301
5302 @functools.cache
5303 def supports_terminal_sequences(stream):
5304 if compat_os_name == 'nt':
5305 if not WINDOWS_VT_MODE:
5306 return False
5307 elif not os.getenv('TERM'):
5308 return False
5309 try:
5310 return stream.isatty()
5311 except BaseException:
5312 return False
5313
5314
5315 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5316 if get_windows_version() < (10, 0, 10586):
5317 return
5318 global WINDOWS_VT_MODE
5319 try:
5320 Popen.run('', shell=True)
5321 except Exception:
5322 return
5323
5324 WINDOWS_VT_MODE = True
5325 supports_terminal_sequences.cache_clear()
5326
5327
5328 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5329
5330
5331 def remove_terminal_sequences(string):
5332 return _terminal_sequences_re.sub('', string)
5333
5334
5335 def number_of_digits(number):
5336 return len('%d' % number)
5337
5338
5339 def join_nonempty(*values, delim='-', from_dict=None):
5340 if from_dict is not None:
5341 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5342 return delim.join(map(str, filter(None, values)))
5343
5344
5345 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5346 """
5347 Find the largest format dimensions in terms of video width and, for each thumbnail:
5348 * Modify the URL: Match the width with the provided regex and replace with the former width
5349 * Update dimensions
5350
5351 This function is useful with video services that scale the provided thumbnails on demand
5352 """
5353 _keys = ('width', 'height')
5354 max_dimensions = max(
5355 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5356 default=(0, 0))
5357 if not max_dimensions[0]:
5358 return thumbnails
5359 return [
5360 merge_dicts(
5361 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5362 dict(zip(_keys, max_dimensions)), thumbnail)
5363 for thumbnail in thumbnails
5364 ]
5365
5366
5367 def parse_http_range(range):
5368 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5369 if not range:
5370 return None, None, None
5371 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5372 if not crg:
5373 return None, None, None
5374 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5375
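# Usage sketch (illustrative): returns (start, end, document_size).
#   >>> parse_http_range('bytes=0-499')
#   (0, 499, None)
#   >>> parse_http_range('bytes 500-999/1234')
#   (500, 999, 1234)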
5376
5377 def read_stdin(what):
5378 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5379 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5380 return sys.stdin
5381
5382
5383 class Config:
5384 own_args = None
5385 parsed_args = None
5386 filename = None
5387 __initialized = False
5388
5389 def __init__(self, parser, label=None):
5390 self.parser, self.label = parser, label
5391 self._loaded_paths, self.configs = set(), []
5392
5393 def init(self, args=None, filename=None):
5394 assert not self.__initialized
5395 directory = ''
5396 if filename:
5397 location = os.path.realpath(filename)
5398 directory = os.path.dirname(location)
5399 if location in self._loaded_paths:
5400 return False
5401 self._loaded_paths.add(location)
5402
5403 self.own_args, self.__initialized = args, True
5404 opts, _ = self.parser.parse_known_args(args)
5405 self.parsed_args, self.filename = args, filename
5406
5407 for location in opts.config_locations or []:
5408 if location == '-':
5409 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5410 continue
5411 location = os.path.join(directory, expand_path(location))
5412 if os.path.isdir(location):
5413 location = os.path.join(location, 'yt-dlp.conf')
5414 if not os.path.exists(location):
5415 self.parser.error(f'config location {location} does not exist')
5416 self.append_config(self.read_file(location), location)
5417 return True
5418
5419 def __str__(self):
5420 label = join_nonempty(
5421 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5422 delim=' ')
5423 return join_nonempty(
5424 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5425 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5426 delim='\n')
5427
5428 @staticmethod
5429 def read_file(filename, default=[]):
5430 try:
5431 optionf = open(filename)
5432 except OSError:
5433 return default # silently skip if file is not present
5434 try:
5435 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5436 contents = optionf.read()
5437 res = shlex.split(contents, comments=True)
5438 except Exception as err:
5439 raise ValueError(f'Unable to parse "{filename}": {err}')
5440 finally:
5441 optionf.close()
5442 return res
5443
5444 @staticmethod
5445 def hide_login_info(opts):
5446 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5447 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5448
5449 def _scrub_eq(o):
5450 m = eqre.match(o)
5451 if m:
5452 return m.group('key') + '=PRIVATE'
5453 else:
5454 return o
5455
5456 opts = list(map(_scrub_eq, opts))
5457 for idx, opt in enumerate(opts):
5458 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5459 opts[idx + 1] = 'PRIVATE'
5460 return opts
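# For example (a sketch with made-up credentials):
#   Config.hide_login_info(['-u', 'alice', '--password=hunter2', '-f', 'best'])
#   -> ['-u', 'PRIVATE', '--password=PRIVATE', '-f', 'best']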
5461
5462 def append_config(self, *args, label=None):
5463 config = type(self)(self.parser, label)
5464 config._loaded_paths = self._loaded_paths
5465 if config.init(*args):
5466 self.configs.append(config)
5467
5468 @property
5469 def all_args(self):
5470 for config in reversed(self.configs):
5471 yield from config.all_args
5472 yield from self.parsed_args or []
5473
5474 def parse_known_args(self, **kwargs):
5475 return self.parser.parse_known_args(self.all_args, **kwargs)
5476
5477 def parse_args(self):
5478 return self.parser.parse_args(self.all_args)
5479
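# A minimal usage sketch of Config (assumes an optparse-style parser that defines a
# "config_locations" option, as yt-dlp's own option parser does):
#   config = Config(parser, label='main')
#   config.init(sys.argv[1:])
#   opts, args = config.parse_known_args()
#   print(config)  # renders the config hierarchy with credentials scrubbed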
5480
5481 class WebSocketsWrapper:
5482 """Wraps the websockets module for use in non-async scopes"""
5483 pool = None
5484
5485 def __init__(self, url, headers=None, connect=True):
5486 self.loop = asyncio.new_event_loop()
5487 # XXX: the "loop" argument is deprecated in newer versions of websockets
5488 self.conn = websockets.connect(
5489 url, extra_headers=headers, ping_interval=None,
5490 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5491 if connect:
5492 self.__enter__()
5493 atexit.register(self.__exit__, None, None, None)
5494
5495 def __enter__(self):
5496 if not self.pool:
5497 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5498 return self
5499
5500 def send(self, *args):
5501 self.run_with_loop(self.pool.send(*args), self.loop)
5502
5503 def recv(self, *args):
5504 return self.run_with_loop(self.pool.recv(*args), self.loop)
5505
5506 def __exit__(self, type, value, traceback):
5507 try:
5508 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5509 finally:
5510 self._cancel_all_tasks(self.loop)  # must run while the loop is still open
5511 self.loop.close()
5512
5513 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5514 # for contributors: if a new library that uses asyncio needs to be run from non-async code, move these functions out of this class
5515 @staticmethod
5516 def run_with_loop(main, loop):
5517 if not asyncio.iscoroutine(main):
5518 raise ValueError(f'a coroutine was expected, got {main!r}')
5519
5520 try:
5521 return loop.run_until_complete(main)
5522 finally:
5523 loop.run_until_complete(loop.shutdown_asyncgens())
5524 if hasattr(loop, 'shutdown_default_executor'):
5525 loop.run_until_complete(loop.shutdown_default_executor())
5526
5527 @staticmethod
5528 def _cancel_all_tasks(loop):
5529 to_cancel = asyncio.all_tasks(loop)
5530
5531 if not to_cancel:
5532 return
5533
5534 for task in to_cancel:
5535 task.cancel()
5536
5537 # XXX: the "loop" argument of asyncio.gather is removed in Python 3.10+
5538 loop.run_until_complete(
5539 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5540
5541 for task in to_cancel:
5542 if task.cancelled():
5543 continue
5544 if task.exception() is not None:
5545 loop.call_exception_handler({
5546 'message': 'unhandled exception during asyncio.run() shutdown',
5547 'exception': task.exception(),
5548 'task': task,
5549 })
5550
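# A minimal usage sketch (assumes the "websockets" package is available and that
# wss://example.invalid/socket is a hypothetical endpoint):
#   ws = WebSocketsWrapper('wss://example.invalid/socket', headers={'Origin': 'https://example.invalid'})
#   ws.send('{"op": "hello"}')
#   reply = ws.recv()
#   ws.__exit__(None, None, None)  # closes both the connection and the event loop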
5551
5552 def merge_headers(*dicts):
5553 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5554 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5555
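# For example:
#   merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
#   -> {'User-Agent': 'B', 'Accept': '*/*'}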
5556
5557 class classproperty:
5558 """classmethod(property(func)) that works in py < 3.9"""
5559
5560 def __init__(self, func):
5561 functools.update_wrapper(self, func)
5562 self.func = func
5563
5564 def __get__(self, _, cls):
5565 return self.func(cls)
5566
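# For example:
#   class Foo:
#       @classproperty
#       def name(cls):
#           return cls.__name__
#   Foo.name -> 'Foo'; Foo().name -> 'Foo' (the owning class is always passed)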
5567
5568 class Namespace(types.SimpleNamespace):
5569 """Immutable namespace"""
5570
5571 def __iter__(self):
5572 return iter(self.__dict__.values())
5573
5574 @property
5575 def items_(self):
5576 return self.__dict__.items()
5577
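# For example:
#   ns = Namespace(video='mp4', audio='m4a')
#   list(ns)        -> ['mp4', 'm4a']  (iteration yields the values)
#   dict(ns.items_) -> {'video': 'mp4', 'audio': 'm4a'}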
5578
5579 # Deprecated
5580 has_certifi = bool(certifi)
5581 has_websockets = bool(websockets)