#!/usr/bin/env python3
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urlparse,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

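# Illustrative doctest-style sketch of the xpath helpers above (the example
# document and values are assumptions, not part of the original module):
#   >>> doc = compat_etree_fromstring('<root><a href="x">t</a></root>')
#   >>> xpath_text(doc, 'a')
#   't'
#   >>> xpath_attr(doc, 'a', 'href')
#   'x'
#   >>> xpath_with_ns('ns:a', {'ns': 'http://example.com'})
#   '{http://example.com}a'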

def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

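# Illustrative sketch of the attribute/class helpers above (markup is an
# assumption, not from the original module):
#   >>> get_elements_by_class('foo', '<div class="foo bar">x</div><p class="foo">y</p>')
#   ['x', 'y']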

class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

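# Illustrative sketch showing how nested tags are handled (values are
# assumptions, not from the original module):
#   >>> get_element_text_and_html_by_tag('span', '<div><span>a<span>b</span></span></div>')
#   ('a<span>b</span>', '<span>a<span>b</span></span>')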

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

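# Illustrative sketch (input/output are assumptions, not from the original module):
#   >>> clean_html('<p>foo</p> <p>bar &amp; baz</p>')
#   'foo\nbar & baz'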

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)

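# Illustrative sketch (assumed input; not part of the original module):
#   >>> json.loads('{"a": 1} trailing junk', cls=LenientJSONDecoder, ignore_extra=True)
#   {'a': 1}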

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

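# Illustrative sketch (example title is an assumption):
#   >>> sanitize_filename('New World record at 0:12:34')
#   'New World record at 0_12_34'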

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

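# Illustrative sketch (example URLs are assumptions):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('rmtp://example.com/live')
#   'rtmp://example.com/live'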

def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

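# Illustrative sketch (credentials are assumptions, not from the original module):
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')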

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

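# Illustrative sketch:
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]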

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

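# Illustrative sketch covering both named and numeric entities:
#   >>> unescapeHTML('&amp;a&#39;')
#   "&a'"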

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode

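# Illustrative sketch (POSIX-only example command; an assumption):
#   >>> out, err, code = Popen.run(['echo', 'hi'], text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   >>> out.strip(), code
#   ('hi', 0)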

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

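# Illustrative sketch:
#   >>> timetuple_from_msec(345067)
#   Time(hours=0, minutes=5, seconds=45, milliseconds=67)
#   >>> formatSeconds(3661, msec=True)
#   '1:01:01.000'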

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
1214 # address data from getaddrinfo() including IPv6. This filters the result from
1215 # getaddrinfo() based on the source_address value.
1216 # This is based on the cpython socket.create_connection() function.
1217 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # (urllib capitalizes the dict keys because of this bug)
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]), so we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when a user does not check the 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies, so that not recognizing them will result in a failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1658
1659 # A 303 must either use GET or HEAD for subsequent request
1660 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1661 if code == 303 and m != 'HEAD':
1662 m = 'GET'
1663 # 301 and 302 redirects are commonly turned into a GET from a POST
1664 # for subsequent requests by browsers, so we'll do the same.
1665 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1666 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1667 if code in (301, 302) and m == 'POST':
1668 m = 'GET'
1669
1670 return urllib.request.Request(
1671 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1672 unverifiable=True, method=m)
1673
1674
1675 def extract_timezone(date_str):
1676 m = re.search(
1677 r'''(?x)
1678 ^.{8,}? # >=8 char non-TZ prefix, if present
1679 (?P<tz>Z| # just the UTC Z, or
1680 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1681 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1682 [ ]? # optional space
1683 (?P<sign>\+|-) # +/-
1684 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1685 $)
1686 ''', date_str)
1687 if not m:
1688 timezone = datetime.timedelta()
1689 else:
1690 date_str = date_str[:-len(m.group('tz'))]
1691 if not m.group('sign'):
1692 timezone = datetime.timedelta()
1693 else:
1694 sign = 1 if m.group('sign') == '+' else -1
1695 timezone = datetime.timedelta(
1696 hours=sign * int(m.group('hours')),
1697 minutes=sign * int(m.group('minutes')))
1698 return timezone, date_str
1699
1700
1701 def parse_iso8601(date_str, delimiter='T', timezone=None):
1702 """ Return a UNIX timestamp from the given date """
1703
1704 if date_str is None:
1705 return None
1706
1707 date_str = re.sub(r'\.[0-9]+', '', date_str)
1708
1709 if timezone is None:
1710 timezone, date_str = extract_timezone(date_str)
1711
1712 with contextlib.suppress(ValueError):
1713 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1714 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1715 return calendar.timegm(dt.timetuple())
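
# Examples (sketch): equivalent inputs yield the same UNIX timestamp
#   >>> parse_iso8601('2014-03-23T23:04:26+0100')
#   1395612266
#   >>> parse_iso8601('2014-03-23T22:04:26Z')
#   1395612266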
1716
1717
1718 def date_formats(day_first=True):
1719 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1720
1721
1722 def unified_strdate(date_str, day_first=True):
1723 """Return a string with the date in the format YYYYMMDD"""
1724
1725 if date_str is None:
1726 return None
1727 upload_date = None
1728 # Replace commas
1729 date_str = date_str.replace(',', ' ')
1730 # Remove AM/PM + timezone
1731 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1732 _, date_str = extract_timezone(date_str)
1733
1734 for expression in date_formats(day_first):
1735 with contextlib.suppress(ValueError):
1736 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1737 if upload_date is None:
1738 timetuple = email.utils.parsedate_tz(date_str)
1739 if timetuple:
1740 with contextlib.suppress(ValueError):
1741 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1742 if upload_date is not None:
1743 return compat_str(upload_date)
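
# Examples (sketch; accepted patterns come from DATE_FORMATS*):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('8/7/2009')  # day_first=True by default
#   '20090708'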
1744
1745
1746 def unified_timestamp(date_str, day_first=True):
1747 if date_str is None:
1748 return None
1749
1750 date_str = re.sub(r'[,|]', '', date_str)
1751
1752 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1753 timezone, date_str = extract_timezone(date_str)
1754
1755 # Remove AM/PM + timezone
1756 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1757
1758 # Remove unrecognized timezones from ISO 8601 alike timestamps
1759 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1760 if m:
1761 date_str = date_str[:-len(m.group('tz'))]
1762
1763 # Python only supports microseconds, so remove nanoseconds
1764 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1765 if m:
1766 date_str = m.group(1)
1767
1768 for expression in date_formats(day_first):
1769 with contextlib.suppress(ValueError):
1770 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1771 return calendar.timegm(dt.timetuple())
1772 timetuple = email.utils.parsedate_tz(date_str)
1773 if timetuple:
1774 return calendar.timegm(timetuple) + pm_delta * 3600
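
# Examples (sketch): unlike parse_iso8601, this also accepts free-form dates
#   >>> unified_timestamp('2014-03-23T22:04:26+0000')
#   1395612266
#   >>> unified_timestamp('December 15, 2017 at 7:49 am')  # AM/PM and month names are handled
#   1513324140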
1775
1776
1777 def determine_ext(url, default_ext='unknown_video'):
1778 if url is None or '.' not in url:
1779 return default_ext
1780 guess = url.partition('?')[0].rpartition('.')[2]
1781 if re.match(r'^[A-Za-z0-9]+$', guess):
1782 return guess
1783 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1784 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1785 return guess.rstrip('/')
1786 else:
1787 return default_ext
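
# Examples (sketch; URLs are hypothetical):
#   >>> determine_ext('http://example.com/video.mp4?download=1')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/stream')
#   'unknown_video'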
1788
1789
1790 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1791 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1792
1793
1794 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1795 R"""
1796 Return a datetime object from a string.
1797 Supported format:
1798 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1799
1800 @param format strftime format of DATE
1801 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1802 auto: round to the unit provided in date_str (if applicable).
1803 """
1804 auto_precision = False
1805 if precision == 'auto':
1806 auto_precision = True
1807 precision = 'microsecond'
1808 today = datetime_round(datetime.datetime.utcnow(), precision)
1809 if date_str in ('now', 'today'):
1810 return today
1811 if date_str == 'yesterday':
1812 return today - datetime.timedelta(days=1)
1813 match = re.match(
1814 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1815 date_str)
1816 if match is not None:
1817 start_time = datetime_from_str(match.group('start'), precision, format)
1818 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1819 unit = match.group('unit')
1820 if unit == 'month' or unit == 'year':
1821 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1822 unit = 'day'
1823 else:
1824 if unit == 'week':
1825 unit = 'day'
1826 time *= 7
1827 delta = datetime.timedelta(**{unit + 's': time})
1828 new_date = start_time + delta
1829 if auto_precision:
1830 return datetime_round(new_date, unit)
1831 return new_date
1832
1833 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
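
# Example (sketch; with precision='auto' the result is rounded to the unit used):
#   >>> datetime_from_str('20220315+2weeks', precision='day')
#   datetime.datetime(2022, 3, 29, 0, 0)
#   'now-1day' would similarly give yesterday's datetime, rounded to the day.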
1834
1835
1836 def date_from_str(date_str, format='%Y%m%d', strict=False):
1837 R"""
1838 Return a date object from a string using datetime_from_str
1839
1840 @param strict Restrict allowed patterns to "YYYYMMDD" and
1841 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1842 """
1843 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1844 raise ValueError(f'Invalid date format "{date_str}"')
1845 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1846
1847
1848 def datetime_add_months(dt, months):
1849 """Increment/Decrement a datetime object by months."""
1850 month = dt.month + months - 1
1851 year = dt.year + month // 12
1852 month = month % 12 + 1
1853 day = min(dt.day, calendar.monthrange(year, month)[1])
1854 return dt.replace(year, month, day)
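
# Example (sketch): the day is clamped to the length of the target month
#   >>> datetime_add_months(datetime.datetime(2021, 1, 31), 1)
#   datetime.datetime(2021, 2, 28, 0, 0)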
1855
1856
1857 def datetime_round(dt, precision='day'):
1858 """
1859 Round a datetime object's time to a specific precision
1860 """
1861 if precision == 'microsecond':
1862 return dt
1863
1864 unit_seconds = {
1865 'day': 86400,
1866 'hour': 3600,
1867 'minute': 60,
1868 'second': 1,
1869 }
1870 roundto = lambda x, n: ((x + n / 2) // n) * n
1871 timestamp = calendar.timegm(dt.timetuple())
1872 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
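
# Example (sketch): rounds to the *nearest* unit rather than truncating
#   >>> datetime_round(datetime.datetime(2021, 6, 15, 13, 0), 'day')
#   datetime.datetime(2021, 6, 16, 0, 0)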
1873
1874
1875 def hyphenate_date(date_str):
1876 """
1877 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1878 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1879 if match is not None:
1880 return '-'.join(match.groups())
1881 else:
1882 return date_str
1883
1884
1885 class DateRange:
1886 """Represents a time interval between two dates"""
1887
1888 def __init__(self, start=None, end=None):
1889 """start and end must be strings in the format accepted by date"""
1890 if start is not None:
1891 self.start = date_from_str(start, strict=True)
1892 else:
1893 self.start = datetime.datetime.min.date()
1894 if end is not None:
1895 self.end = date_from_str(end, strict=True)
1896 else:
1897 self.end = datetime.datetime.max.date()
1898 if self.start > self.end:
1899 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1900
1901 @classmethod
1902 def day(cls, day):
1903 """Returns a range that only contains the given day"""
1904 return cls(day, day)
1905
1906 def __contains__(self, date):
1907 """Check if the date is in the range"""
1908 if not isinstance(date, datetime.date):
1909 date = date_from_str(date)
1910 return self.start <= date <= self.end
1911
1912 def __str__(self):
1913 return f'{self.start.isoformat()} - {self.end.isoformat()}'
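
# Example (sketch): membership accepts both date objects and parseable strings
#   >>> '20220115' in DateRange('20220101', '20220131')
#   True
#   >>> '20220201' in DateRange.day('20220131')
#   False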
1914
1915
1916 def platform_name():
1917 """ Returns the platform name as a compat_str """
1918 res = platform.platform()
1919 if isinstance(res, bytes):
1920 res = res.decode(preferredencoding())
1921
1922 assert isinstance(res, compat_str)
1923 return res
1924
1925
1926 @functools.cache
1927 def get_windows_version():
1928 ''' Get Windows version. Returns () if not running on Windows '''
1929 if compat_os_name == 'nt':
1930 return version_tuple(platform.win32_ver()[1])
1931 else:
1932 return ()
1933
1934
1935 def write_string(s, out=None, encoding=None):
1936 assert isinstance(s, str)
1937 out = out or sys.stderr
1938
1939 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1940 s = re.sub(r'([\r\n]+)', r' \1', s)
1941
1942 enc, buffer = None, out
1943 if 'b' in getattr(out, 'mode', ''):
1944 enc = encoding or preferredencoding()
1945 elif hasattr(out, 'buffer'):
1946 buffer = out.buffer
1947 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1948
1949 buffer.write(s.encode(enc, 'ignore') if enc else s)
1950 out.flush()
1951
1952
1953 def bytes_to_intlist(bs):
1954 if not bs:
1955 return []
1956 if isinstance(bs[0], int):  # bytes-like input; iterating it already yields ints
1957 return list(bs)
1958 else:
1959 return [ord(c) for c in bs]
1960
1961
1962 def intlist_to_bytes(xs):
1963 if not xs:
1964 return b''
1965 return struct.pack('%dB' % len(xs), *xs)
1966
1967
1968 class LockingUnsupportedError(OSError):
1969 msg = 'File locking is not supported'
1970
1971 def __init__(self):
1972 super().__init__(self.msg)
1973
1974
1975 # Cross-platform file locking
1976 if sys.platform == 'win32':
1977 import ctypes.wintypes
1978 import msvcrt
1979
1980 class OVERLAPPED(ctypes.Structure):
1981 _fields_ = [
1982 ('Internal', ctypes.wintypes.LPVOID),
1983 ('InternalHigh', ctypes.wintypes.LPVOID),
1984 ('Offset', ctypes.wintypes.DWORD),
1985 ('OffsetHigh', ctypes.wintypes.DWORD),
1986 ('hEvent', ctypes.wintypes.HANDLE),
1987 ]
1988
1989 kernel32 = ctypes.windll.kernel32
1990 LockFileEx = kernel32.LockFileEx
1991 LockFileEx.argtypes = [
1992 ctypes.wintypes.HANDLE, # hFile
1993 ctypes.wintypes.DWORD, # dwFlags
1994 ctypes.wintypes.DWORD, # dwReserved
1995 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1996 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1997 ctypes.POINTER(OVERLAPPED) # Overlapped
1998 ]
1999 LockFileEx.restype = ctypes.wintypes.BOOL
2000 UnlockFileEx = kernel32.UnlockFileEx
2001 UnlockFileEx.argtypes = [
2002 ctypes.wintypes.HANDLE, # hFile
2003 ctypes.wintypes.DWORD, # dwReserved
2004 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2005 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2006 ctypes.POINTER(OVERLAPPED) # Overlapped
2007 ]
2008 UnlockFileEx.restype = ctypes.wintypes.BOOL
2009 whole_low = 0xffffffff
2010 whole_high = 0x7fffffff
2011
2012 def _lock_file(f, exclusive, block):
2013 overlapped = OVERLAPPED()
2014 overlapped.Offset = 0
2015 overlapped.OffsetHigh = 0
2016 overlapped.hEvent = 0
2017 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2018
2019 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2020 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2021 0, whole_low, whole_high, f._lock_file_overlapped_p):
2022 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2023 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2024
2025 def _unlock_file(f):
2026 assert f._lock_file_overlapped_p
2027 handle = msvcrt.get_osfhandle(f.fileno())
2028 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2029 raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))
2030
2031 else:
2032 try:
2033 import fcntl
2034
2035 def _lock_file(f, exclusive, block):
2036 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2037 if not block:
2038 flags |= fcntl.LOCK_NB
2039 try:
2040 fcntl.flock(f, flags)
2041 except BlockingIOError:
2042 raise
2043 except OSError: # AOSP does not have flock()
2044 fcntl.lockf(f, flags)
2045
2046 def _unlock_file(f):
2047 try:
2048 fcntl.flock(f, fcntl.LOCK_UN)
2049 except OSError:
2050 fcntl.lockf(f, fcntl.LOCK_UN)
2051
2052 except ImportError:
2053
2054 def _lock_file(f, exclusive, block):
2055 raise LockingUnsupportedError()
2056
2057 def _unlock_file(f):
2058 raise LockingUnsupportedError()
2059
2060
2061 class locked_file:
2062 locked = False
2063
2064 def __init__(self, filename, mode, block=True, encoding=None):
2065 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2066 raise NotImplementedError(mode)
2067 self.mode, self.block = mode, block
2068
2069 writable = any(f in mode for f in 'wax+')
2070 readable = any(f in mode for f in 'r+')
2071 flags = functools.reduce(operator.ior, (
2072 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2073 getattr(os, 'O_BINARY', 0), # Windows only
2074 getattr(os, 'O_NOINHERIT', 0), # Windows only
2075 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2076 os.O_APPEND if 'a' in mode else 0,
2077 os.O_EXCL if 'x' in mode else 0,
2078 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2079 ))
2080
2081 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2082
2083 def __enter__(self):
2084 exclusive = 'r' not in self.mode
2085 try:
2086 _lock_file(self.f, exclusive, self.block)
2087 self.locked = True
2088 except OSError:
2089 self.f.close()
2090 raise
2091 if 'w' in self.mode:
2092 try:
2093 self.f.truncate()
2094 except OSError as e:
2095 if e.errno not in (
2096 errno.ESPIPE, # Illegal seek - expected for FIFO
2097 errno.EINVAL, # Invalid argument - expected for /dev/null
2098 ):
2099 raise
2100 return self
2101
2102 def unlock(self):
2103 if not self.locked:
2104 return
2105 try:
2106 _unlock_file(self.f)
2107 finally:
2108 self.locked = False
2109
2110 def __exit__(self, *_):
2111 try:
2112 self.unlock()
2113 finally:
2114 self.f.close()
2115
2116 open = __enter__
2117 close = __exit__
2118
2119 def __getattr__(self, attr):
2120 return getattr(self.f, attr)
2121
2122 def __iter__(self):
2123 return iter(self.f)
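
# Illustrative usage (sketch; 'progress.lock' is a hypothetical path):
#   with locked_file('progress.lock', 'w', block=False) as f:
#       f.write('...')  # acquiring raises BlockingIOError if the lock is already held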
2124
2125
2126 @functools.cache
2127 def get_filesystem_encoding():
2128 encoding = sys.getfilesystemencoding()
2129 return encoding if encoding is not None else 'utf-8'
2130
2131
2132 def shell_quote(args):
2133 quoted_args = []
2134 encoding = get_filesystem_encoding()
2135 for a in args:
2136 if isinstance(a, bytes):
2137 # We may get a filename encoded with 'encodeFilename'
2138 a = a.decode(encoding)
2139 quoted_args.append(compat_shlex_quote(a))
2140 return ' '.join(quoted_args)
2141
2142
2143 def smuggle_url(url, data):
2144 """ Pass additional data in a URL for internal use. """
2145
2146 url, idata = unsmuggle_url(url, {})
2147 data.update(idata)
2148 sdata = compat_urllib_parse_urlencode(
2149 {'__youtubedl_smuggle': json.dumps(data)})
2150 return url + '#' + sdata
2151
2152
2153 def unsmuggle_url(smug_url, default=None):
2154 if '#__youtubedl_smuggle' not in smug_url:
2155 return smug_url, default
2156 url, _, sdata = smug_url.rpartition('#')
2157 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2158 data = json.loads(jsond)
2159 return url, data
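
# Example (sketch; URL and payload are hypothetical): smuggling round-trips via the fragment
#   >>> url = smuggle_url('https://example.com/watch?v=abc', {'referer': 'https://example.com/'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/watch?v=abc', {'referer': 'https://example.com/'})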
2160
2161
2162 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2163 """ Formats numbers with decimal sufixes like K, M, etc """
2164 num, factor = float_or_none(num), float(factor)
2165 if num is None or num < 0:
2166 return None
2167 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2168 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2169 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2170 if factor == 1024:
2171 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2172 converted = num / (factor ** exponent)
2173 return fmt % (converted, suffix)
2174
2175
2176 def format_bytes(bytes):
2177 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
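
# Examples (sketch):
#   >>> format_decimal_suffix(123456)
#   '123k'
#   >>> format_bytes(1024 ** 2)
#   '1.00MiB'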
2178
2179
2180 def lookup_unit_table(unit_table, s):
2181 units_re = '|'.join(re.escape(u) for u in unit_table)
2182 m = re.match(
2183 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2184 if not m:
2185 return None
2186 num_str = m.group('num').replace(',', '.')
2187 mult = unit_table[m.group('unit')]
2188 return int(float(num_str) * mult)
2189
2190
2191 def parse_filesize(s):
2192 if s is None:
2193 return None
2194
2195 # The lower-case forms are of course incorrect and unofficial,
2196 # but we support those too
2197 _UNIT_TABLE = {
2198 'B': 1,
2199 'b': 1,
2200 'bytes': 1,
2201 'KiB': 1024,
2202 'KB': 1000,
2203 'kB': 1024,
2204 'Kb': 1000,
2205 'kb': 1000,
2206 'kilobytes': 1000,
2207 'kibibytes': 1024,
2208 'MiB': 1024 ** 2,
2209 'MB': 1000 ** 2,
2210 'mB': 1024 ** 2,
2211 'Mb': 1000 ** 2,
2212 'mb': 1000 ** 2,
2213 'megabytes': 1000 ** 2,
2214 'mebibytes': 1024 ** 2,
2215 'GiB': 1024 ** 3,
2216 'GB': 1000 ** 3,
2217 'gB': 1024 ** 3,
2218 'Gb': 1000 ** 3,
2219 'gb': 1000 ** 3,
2220 'gigabytes': 1000 ** 3,
2221 'gibibytes': 1024 ** 3,
2222 'TiB': 1024 ** 4,
2223 'TB': 1000 ** 4,
2224 'tB': 1024 ** 4,
2225 'Tb': 1000 ** 4,
2226 'tb': 1000 ** 4,
2227 'terabytes': 1000 ** 4,
2228 'tebibytes': 1024 ** 4,
2229 'PiB': 1024 ** 5,
2230 'PB': 1000 ** 5,
2231 'pB': 1024 ** 5,
2232 'Pb': 1000 ** 5,
2233 'pb': 1000 ** 5,
2234 'petabytes': 1000 ** 5,
2235 'pebibytes': 1024 ** 5,
2236 'EiB': 1024 ** 6,
2237 'EB': 1000 ** 6,
2238 'eB': 1024 ** 6,
2239 'Eb': 1000 ** 6,
2240 'eb': 1000 ** 6,
2241 'exabytes': 1000 ** 6,
2242 'exbibytes': 1024 ** 6,
2243 'ZiB': 1024 ** 7,
2244 'ZB': 1000 ** 7,
2245 'zB': 1024 ** 7,
2246 'Zb': 1000 ** 7,
2247 'zb': 1000 ** 7,
2248 'zettabytes': 1000 ** 7,
2249 'zebibytes': 1024 ** 7,
2250 'YiB': 1024 ** 8,
2251 'YB': 1000 ** 8,
2252 'yB': 1024 ** 8,
2253 'Yb': 1000 ** 8,
2254 'yb': 1000 ** 8,
2255 'yottabytes': 1000 ** 8,
2256 'yobibytes': 1024 ** 8,
2257 }
2258
2259 return lookup_unit_table(_UNIT_TABLE, s)
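
# Examples (sketch): binary and decimal units, with ',' accepted as decimal separator
#   >>> parse_filesize('2 MiB')
#   2097152
#   >>> parse_filesize('1,24 KB')
#   1240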
2260
2261
2262 def parse_count(s):
2263 if s is None:
2264 return None
2265
2266 s = re.sub(r'^[^\d]+\s', '', s).strip()
2267
2268 if re.match(r'^[\d,.]+$', s):
2269 return str_to_int(s)
2270
2271 _UNIT_TABLE = {
2272 'k': 1000,
2273 'K': 1000,
2274 'm': 1000 ** 2,
2275 'M': 1000 ** 2,
2276 'kk': 1000 ** 2,
2277 'KK': 1000 ** 2,
2278 'b': 1000 ** 3,
2279 'B': 1000 ** 3,
2280 }
2281
2282 ret = lookup_unit_table(_UNIT_TABLE, s)
2283 if ret is not None:
2284 return ret
2285
2286 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2287 if mobj:
2288 return str_to_int(mobj.group(1))
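
# Examples (sketch):
#   >>> parse_count('1.8M')
#   1800000
#   >>> parse_count('1,234 views')
#   1234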
2289
2290
2291 def parse_resolution(s, *, lenient=False):
2292 if s is None:
2293 return {}
2294
2295 if lenient:
2296 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2297 else:
2298 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2299 if mobj:
2300 return {
2301 'width': int(mobj.group('w')),
2302 'height': int(mobj.group('h')),
2303 }
2304
2305 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2306 if mobj:
2307 return {'height': int(mobj.group(1))}
2308
2309 mobj = re.search(r'\b([48])[kK]\b', s)
2310 if mobj:
2311 return {'height': int(mobj.group(1)) * 540}
2312
2313 return {}
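
# Examples (sketch):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}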
2314
2315
2316 def parse_bitrate(s):
2317 if not isinstance(s, compat_str):
2318 return
2319 mobj = re.search(r'\b(\d+)\s*kbps', s)
2320 if mobj:
2321 return int(mobj.group(1))
2322
2323
2324 def month_by_name(name, lang='en'):
2325 """ Return the number of a month by (locale-independently) English name """
2326
2327 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2328
2329 try:
2330 return month_names.index(name) + 1
2331 except ValueError:
2332 return None
2333
2334
2335 def month_by_abbreviation(abbrev):
2336 """ Return the number of a month by (locale-independently) English
2337 abbreviations """
2338
2339 try:
2340 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2341 except ValueError:
2342 return None
2343
2344
2345 def fix_xml_ampersands(xml_str):
2346 """Replace all the '&' by '&amp;' in XML"""
2347 return re.sub(
2348 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2349 '&amp;',
2350 xml_str)
2351
2352
2353 def setproctitle(title):
2354 assert isinstance(title, compat_str)
2355
2356 # ctypes in Jython is not complete
2357 # http://bugs.jython.org/issue2148
2358 if sys.platform.startswith('java'):
2359 return
2360
2361 try:
2362 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2363 except OSError:
2364 return
2365 except TypeError:
2366 # LoadLibrary in Windows Python 2.7.13 only expects
2367 # a bytestring, but since unicode_literals turns
2368 # every string into a unicode string, it fails.
2369 return
2370 title_bytes = title.encode()
2371 buf = ctypes.create_string_buffer(len(title_bytes))
2372 buf.value = title_bytes
2373 try:
2374 libc.prctl(15, buf, 0, 0, 0)
2375 except AttributeError:
2376 return # Strange libc, just skip this
2377
2378
2379 def remove_start(s, start):
2380 return s[len(start):] if s is not None and s.startswith(start) else s
2381
2382
2383 def remove_end(s, end):
2384 return s[:-len(end)] if s is not None and s.endswith(end) else s
2385
2386
2387 def remove_quotes(s):
2388 if s is None or len(s) < 2:
2389 return s
2390 for quote in ('"', "'", ):
2391 if s[0] == quote and s[-1] == quote:
2392 return s[1:-1]
2393 return s
2394
2395
2396 def get_domain(url):
2397 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2398 return domain.group('domain') if domain else None
2399
2400
2401 def url_basename(url):
2402 path = compat_urlparse.urlparse(url).path
2403 return path.strip('/').split('/')[-1]
2404
2405
2406 def base_url(url):
2407 return re.match(r'https?://[^?#&]+/', url).group()
2408
2409
2410 def urljoin(base, path):
2411 if isinstance(path, bytes):
2412 path = path.decode()
2413 if not isinstance(path, compat_str) or not path:
2414 return None
2415 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2416 return path
2417 if isinstance(base, bytes):
2418 base = base.decode()
2419 if not isinstance(base, compat_str) or not re.match(
2420 r'^(?:https?:)?//', base):
2421 return None
2422 return compat_urlparse.urljoin(base, path)
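
# Examples (sketch; URLs are hypothetical):
#   >>> urljoin('https://example.com/a/', 'b.mp4')
#   'https://example.com/a/b.mp4'
#   >>> urljoin('https://example.com/', '//cdn.example.com/x.m3u8')  # already absolute
#   '//cdn.example.com/x.m3u8'
#   >>> urljoin(None, 'b.mp4') is None
#   True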
2423
2424
2425 class HEADRequest(urllib.request.Request):
2426 def get_method(self):
2427 return 'HEAD'
2428
2429
2430 class PUTRequest(urllib.request.Request):
2431 def get_method(self):
2432 return 'PUT'
2433
2434
2435 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2436 if get_attr and v is not None:
2437 v = getattr(v, get_attr, None)
2438 try:
2439 return int(v) * invscale // scale
2440 except (ValueError, TypeError, OverflowError):
2441 return default
2442
2443
2444 def str_or_none(v, default=None):
2445 return default if v is None else compat_str(v)
2446
2447
2448 def str_to_int(int_str):
2449 """ A more relaxed version of int_or_none """
2450 if isinstance(int_str, int):
2451 return int_str
2452 elif isinstance(int_str, compat_str):
2453 int_str = re.sub(r'[,\.\+]', '', int_str)
2454 return int_or_none(int_str)
2455
2456
2457 def float_or_none(v, scale=1, invscale=1, default=None):
2458 if v is None:
2459 return default
2460 try:
2461 return float(v) * invscale / scale
2462 except (ValueError, TypeError):
2463 return default
2464
2465
2466 def bool_or_none(v, default=None):
2467 return v if isinstance(v, bool) else default
2468
2469
2470 def strip_or_none(v, default=None):
2471 return v.strip() if isinstance(v, compat_str) else default
2472
2473
2474 def url_or_none(url):
2475 if not url or not isinstance(url, compat_str):
2476 return None
2477 url = url.strip()
2478 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2479
2480
2481 def request_to_url(req):
2482 if isinstance(req, urllib.request.Request):
2483 return req.get_full_url()
2484 else:
2485 return req
2486
2487
2488 def strftime_or_none(timestamp, date_format, default=None):
2489 datetime_object = None
2490 try:
2491 if isinstance(timestamp, (int, float)): # unix timestamp
2492 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2493 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2494 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2495 return datetime_object.strftime(date_format)
2496 except (ValueError, TypeError, AttributeError):
2497 return default
2498
2499
2500 def parse_duration(s):
2501 if not isinstance(s, str):
2502 return None
2503 s = s.strip()
2504 if not s:
2505 return None
2506
2507 days, hours, mins, secs, ms = [None] * 5
2508 m = re.match(r'''(?x)
2509 (?P<before_secs>
2510 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2511 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2512 (?P<ms>[.:][0-9]+)?Z?$
2513 ''', s)
2514 if m:
2515 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2516 else:
2517 m = re.match(
2518 r'''(?ix)(?:P?
2519 (?:
2520 [0-9]+\s*y(?:ears?)?,?\s*
2521 )?
2522 (?:
2523 [0-9]+\s*m(?:onths?)?,?\s*
2524 )?
2525 (?:
2526 [0-9]+\s*w(?:eeks?)?,?\s*
2527 )?
2528 (?:
2529 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2530 )?
2531 T)?
2532 (?:
2533 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2534 )?
2535 (?:
2536 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2537 )?
2538 (?:
2539 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2540 )?Z?$''', s)
2541 if m:
2542 days, hours, mins, secs, ms = m.groups()
2543 else:
2544 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2545 if m:
2546 hours, mins = m.groups()
2547 else:
2548 return None
2549
2550 if ms:
2551 ms = ms.replace(':', '.')
2552 return sum(float(part or 0) * mult for part, mult in (
2553 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
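
# Examples (sketch): clock-style, ISO 8601-style and free-form durations
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('PT1H30M')
#   5400.0
#   >>> parse_duration('3 min 10 sec')
#   190.0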
2554
2555
2556 def prepend_extension(filename, ext, expected_real_ext=None):
2557 name, real_ext = os.path.splitext(filename)
2558 return (
2559 f'{name}.{ext}{real_ext}'
2560 if not expected_real_ext or real_ext[1:] == expected_real_ext
2561 else f'{filename}.{ext}')
2562
2563
2564 def replace_extension(filename, ext, expected_real_ext=None):
2565 name, real_ext = os.path.splitext(filename)
2566 return '{}.{}'.format(
2567 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2568 ext)
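
# Examples (sketch):
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.mp4', 'mkv')
#   'video.mkv'
#   >>> replace_extension('video.unknown_video', 'mkv', expected_real_ext='mp4')
#   'video.unknown_video.mkv'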
2569
2570
2571 def check_executable(exe, args=[]):
2572 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2573 args can be a list of arguments for a short output (like -version) """
2574 try:
2575 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2576 except OSError:
2577 return False
2578 return exe
2579
2580
2581 def _get_exe_version_output(exe, args, *, to_screen=None):
2582 if to_screen:
2583 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2584 try:
2585 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2586 # SIGTTOU if yt-dlp is run in the background.
2587 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2588 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2589 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2590 except OSError:
2591 return False
2592 return stdout
2593
2594
2595 def detect_exe_version(output, version_re=None, unrecognized='present'):
2596 assert isinstance(output, compat_str)
2597 if version_re is None:
2598 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2599 m = re.search(version_re, output)
2600 if m:
2601 return m.group(1)
2602 else:
2603 return unrecognized
2604
2605
2606 def get_exe_version(exe, args=['--version'],
2607 version_re=None, unrecognized='present'):
2608 """ Returns the version of the specified executable,
2609 or False if the executable is not present """
2610 out = _get_exe_version_output(exe, args)
2611 return detect_exe_version(out, version_re, unrecognized) if out else False
2612
2613
2614 def frange(start=0, stop=None, step=1):
2615 """Float range"""
2616 if stop is None:
2617 start, stop = 0, start
2618 sign = [-1, 1][step > 0] if step else 0
2619 while sign * start < sign * stop:
2620 yield start
2621 start += step
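
# Example (sketch):
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]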
2622
2623
2624 class LazyList(collections.abc.Sequence):
2625 """Lazy immutable list from an iterable
2626 Note that slices of a LazyList are lists and not LazyList"""
2627
2628 class IndexError(IndexError):
2629 pass
2630
2631 def __init__(self, iterable, *, reverse=False, _cache=None):
2632 self._iterable = iter(iterable)
2633 self._cache = [] if _cache is None else _cache
2634 self._reversed = reverse
2635
2636 def __iter__(self):
2637 if self._reversed:
2638 # We need to consume the entire iterable to iterate in reverse
2639 yield from self.exhaust()
2640 return
2641 yield from self._cache
2642 for item in self._iterable:
2643 self._cache.append(item)
2644 yield item
2645
2646 def _exhaust(self):
2647 self._cache.extend(self._iterable)
2648 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2649 return self._cache
2650
2651 def exhaust(self):
2652 """Evaluate the entire iterable"""
2653 return self._exhaust()[::-1 if self._reversed else 1]
2654
2655 @staticmethod
2656 def _reverse_index(x):
2657 return None if x is None else -(x + 1)
2658
2659 def __getitem__(self, idx):
2660 if isinstance(idx, slice):
2661 if self._reversed:
2662 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2663 start, stop, step = idx.start, idx.stop, idx.step or 1
2664 elif isinstance(idx, int):
2665 if self._reversed:
2666 idx = self._reverse_index(idx)
2667 start, stop, step = idx, idx, 0
2668 else:
2669 raise TypeError('indices must be integers or slices')
2670 if ((start or 0) < 0 or (stop or 0) < 0
2671 or (start is None and step < 0)
2672 or (stop is None and step > 0)):
2673 # We need to consume the entire iterable to be able to slice from the end
2674 # Obviously, never use this with infinite iterables
2675 self._exhaust()
2676 try:
2677 return self._cache[idx]
2678 except IndexError as e:
2679 raise self.IndexError(e) from e
2680 n = max(start or 0, stop or 0) - len(self._cache) + 1
2681 if n > 0:
2682 self._cache.extend(itertools.islice(self._iterable, n))
2683 try:
2684 return self._cache[idx]
2685 except IndexError as e:
2686 raise self.IndexError(e) from e
2687
2688 def __bool__(self):
2689 try:
2690 self[-1] if self._reversed else self[0]
2691 except self.IndexError:
2692 return False
2693 return True
2694
2695 def __len__(self):
2696 self._exhaust()
2697 return len(self._cache)
2698
2699 def __reversed__(self):
2700 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2701
2702 def __copy__(self):
2703 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2704
2705 def __repr__(self):
2706 # repr and str should mimic a list. So we exhaust the iterable
2707 return repr(self.exhaust())
2708
2709 def __str__(self):
2710 return repr(self.exhaust())
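
# Example (sketch): items are pulled from the iterable only as far as needed
#   >>> l = LazyList(x * 2 for x in range(5))
#   >>> l[3]  # consumes only the first four items
#   6
#   >>> list(reversed(l))  # reversal requires exhausting the iterable
#   [8, 6, 4, 2, 0]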
2711
2712
2713 class PagedList:
2714
2715 class IndexError(IndexError):
2716 pass
2717
2718 def __len__(self):
2719 # This is only useful for tests
2720 return len(self.getslice())
2721
2722 def __init__(self, pagefunc, pagesize, use_cache=True):
2723 self._pagefunc = pagefunc
2724 self._pagesize = pagesize
2725 self._pagecount = float('inf')
2726 self._use_cache = use_cache
2727 self._cache = {}
2728
2729 def getpage(self, pagenum):
2730 page_results = self._cache.get(pagenum)
2731 if page_results is None:
2732 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2733 if self._use_cache:
2734 self._cache[pagenum] = page_results
2735 return page_results
2736
2737 def getslice(self, start=0, end=None):
2738 return list(self._getslice(start, end))
2739
2740 def _getslice(self, start, end):
2741 raise NotImplementedError('This method must be implemented by subclasses')
2742
2743 def __getitem__(self, idx):
2744 assert self._use_cache, 'Indexing PagedList requires cache'
2745 if not isinstance(idx, int) or idx < 0:
2746 raise TypeError('indices must be non-negative integers')
2747 entries = self.getslice(idx, idx + 1)
2748 if not entries:
2749 raise self.IndexError()
2750 return entries[0]
2751
2752
2753 class OnDemandPagedList(PagedList):
2754 """Download pages until a page with less than maximum results"""
2755
2756 def _getslice(self, start, end):
2757 for pagenum in itertools.count(start // self._pagesize):
2758 firstid = pagenum * self._pagesize
2759 nextfirstid = pagenum * self._pagesize + self._pagesize
2760 if start >= nextfirstid:
2761 continue
2762
2763 startv = (
2764 start % self._pagesize
2765 if firstid <= start < nextfirstid
2766 else 0)
2767 endv = (
2768 ((end - 1) % self._pagesize) + 1
2769 if (end is not None and firstid <= end <= nextfirstid)
2770 else None)
2771
2772 try:
2773 page_results = self.getpage(pagenum)
2774 except Exception:
2775 self._pagecount = pagenum - 1
2776 raise
2777 if startv != 0 or endv is not None:
2778 page_results = page_results[startv:endv]
2779 yield from page_results
2780
2781                 # A little optimization: if the current page is not "full", i.e. does
2782                 # not contain page_size videos, then we can assume that this page
2783                 # is the last one - there are no more ids on further pages,
2784                 # so there is no need to query again.
2785 if len(page_results) + startv < self._pagesize:
2786 break
2787
2788 # If we got the whole page, but the next page is not interesting,
2789 # break out early as well
2790 if end == nextfirstid:
2791 break
2792
2793
2794 class InAdvancePagedList(PagedList):
2795 """PagedList with total number of pages known in advance"""
2796
2797 def __init__(self, pagefunc, pagecount, pagesize):
2798 PagedList.__init__(self, pagefunc, pagesize, True)
2799 self._pagecount = pagecount
2800
2801 def _getslice(self, start, end):
2802 start_page = start // self._pagesize
2803 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2804 skip_elems = start - start_page * self._pagesize
2805 only_more = None if end is None else end - start
2806 for pagenum in range(start_page, end_page):
2807 page_results = self.getpage(pagenum)
2808 if skip_elems:
2809 page_results = page_results[skip_elems:]
2810 skip_elems = None
2811 if only_more is not None:
2812 if len(page_results) < only_more:
2813 only_more -= len(page_results)
2814 else:
2815 yield from page_results[:only_more]
2816 break
2817 yield from page_results
2818
2819
2820 class PlaylistEntries:
2821 MissingEntry = object()
2822 is_exhausted = False
2823
2824 def __init__(self, ydl, info_dict):
2825 self.ydl = ydl
2826
2827 # _entries must be assigned now since infodict can change during iteration
2828 entries = info_dict.get('entries')
2829 if entries is None:
2830 raise EntryNotInPlaylist('There are no entries')
2831 elif isinstance(entries, list):
2832 self.is_exhausted = True
2833
2834 requested_entries = info_dict.get('requested_entries')
2835 self.is_incomplete = bool(requested_entries)
2836 if self.is_incomplete:
2837 assert self.is_exhausted
2838 self._entries = [self.MissingEntry] * max(requested_entries)
2839 for i, entry in zip(requested_entries, entries):
2840 self._entries[i - 1] = entry
2841 elif isinstance(entries, (list, PagedList, LazyList)):
2842 self._entries = entries
2843 else:
2844 self._entries = LazyList(entries)
2845
2846 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2847 (?P<start>[+-]?\d+)?
2848 (?P<range>[:-]
2849 (?P<end>[+-]?\d+|inf(?:inite)?)?
2850 (?::(?P<step>[+-]?\d+))?
2851 )?''')
2852
2853 @classmethod
2854 def parse_playlist_items(cls, string):
2855 for segment in string.split(','):
2856 if not segment:
2857                 raise ValueError('There are two or more consecutive commas')
2858 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2859 if not mobj:
2860 raise ValueError(f'{segment!r} is not a valid specification')
2861 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2862 if int_or_none(step) == 0:
2863 raise ValueError(f'Step in {segment!r} cannot be zero')
2864 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2865
2866 def get_requested_items(self):
2867 playlist_items = self.ydl.params.get('playlist_items')
2868 playlist_start = self.ydl.params.get('playliststart', 1)
2869 playlist_end = self.ydl.params.get('playlistend')
2870 # For backwards compatibility, interpret -1 as whole list
2871 if playlist_end in (-1, None):
2872 playlist_end = ''
2873 if not playlist_items:
2874 playlist_items = f'{playlist_start}:{playlist_end}'
2875 elif playlist_start != 1 or playlist_end:
2876 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2877
2878 for index in self.parse_playlist_items(playlist_items):
2879 for i, entry in self[index]:
2880 yield i, entry
2881 if not entry:
2882 continue
2883 try:
2884 # TODO: Add auto-generated fields
2885 self.ydl._match_entry(entry, incomplete=True, silent=True)
2886 except (ExistingVideoReached, RejectedVideoReached):
2887 return
2888
2889 def get_full_count(self):
2890 if self.is_exhausted and not self.is_incomplete:
2891 return len(self)
2892 elif isinstance(self._entries, InAdvancePagedList):
2893 if self._entries._pagesize == 1:
2894 return self._entries._pagecount
2895
2896 @functools.cached_property
2897 def _getter(self):
2898 if isinstance(self._entries, list):
2899 def get_entry(i):
2900 try:
2901 entry = self._entries[i]
2902 except IndexError:
2903 entry = self.MissingEntry
2904 if not self.is_incomplete:
2905 raise self.IndexError()
2906 if entry is self.MissingEntry:
2907 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2908 return entry
2909 else:
2910 def get_entry(i):
2911 try:
2912 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2913 except (LazyList.IndexError, PagedList.IndexError):
2914 raise self.IndexError()
2915 return get_entry
2916
2917 def __getitem__(self, idx):
2918 if isinstance(idx, int):
2919 idx = slice(idx, idx)
2920
2921 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2922 step = 1 if idx.step is None else idx.step
2923 if idx.start is None:
2924 start = 0 if step > 0 else len(self) - 1
2925 else:
2926 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2927
2928 # NB: Do not call len(self) when idx == [:]
2929 if idx.stop is None:
2930 stop = 0 if step < 0 else float('inf')
2931 else:
2932 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2933 stop += [-1, 1][step > 0]
2934
2935 for i in frange(start, stop, step):
2936 if i < 0:
2937 continue
2938 try:
2939 entry = self._getter(i)
2940 except self.IndexError:
2941 self.is_exhausted = True
2942 if step > 0:
2943 break
2944 continue
2945 yield i + 1, entry
2946
2947 def __len__(self):
2948 return len(tuple(self[:]))
2949
2950 class IndexError(IndexError):
2951 pass
2952
2953
2954 def uppercase_escape(s):
2955 unicode_escape = codecs.getdecoder('unicode_escape')
2956 return re.sub(
2957 r'\\U[0-9a-fA-F]{8}',
2958 lambda m: unicode_escape(m.group(0))[0],
2959 s)
2960
2961
2962 def lowercase_escape(s):
2963 unicode_escape = codecs.getdecoder('unicode_escape')
2964 return re.sub(
2965 r'\\u[0-9a-fA-F]{4}',
2966 lambda m: unicode_escape(m.group(0))[0],
2967 s)
2968
2969
2970 def escape_rfc3986(s):
2971 """Escape non-ASCII characters as suggested by RFC 3986"""
2972 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2973
2974
2975 def escape_url(url):
2976 """Escape URL as suggested by RFC 3986"""
2977 url_parsed = compat_urllib_parse_urlparse(url)
2978 return url_parsed._replace(
2979 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2980 path=escape_rfc3986(url_parsed.path),
2981 params=escape_rfc3986(url_parsed.params),
2982 query=escape_rfc3986(url_parsed.query),
2983 fragment=escape_rfc3986(url_parsed.fragment)
2984 ).geturl()
2985
2986
2987 def parse_qs(url):
2988 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2989
2990
2991 def read_batch_urls(batch_fd):
2992 def fixup(url):
2993 if not isinstance(url, compat_str):
2994 url = url.decode('utf-8', 'replace')
2995 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2996 for bom in BOM_UTF8:
2997 if url.startswith(bom):
2998 url = url[len(bom):]
2999 url = url.lstrip()
3000 if not url or url.startswith(('#', ';', ']')):
3001 return False
3002 # "#" cannot be stripped out since it is part of the URI
3003             # However, it can be safely stripped out if following a whitespace
3004 return re.split(r'\s#', url, 1)[0].rstrip()
3005
3006 with contextlib.closing(batch_fd) as fd:
3007 return [url for url in map(fixup, fd) if url]
3008
3009
3010 def urlencode_postdata(*args, **kargs):
3011 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
3012
3013
3014 def update_url_query(url, query):
3015 if not query:
3016 return url
3017 parsed_url = compat_urlparse.urlparse(url)
3018 qs = compat_parse_qs(parsed_url.query)
3019 qs.update(query)
3020 return compat_urlparse.urlunparse(parsed_url._replace(
3021 query=compat_urllib_parse_urlencode(qs, True)))
3022
3023
3024 def update_Request(req, url=None, data=None, headers={}, query={}):
3025 req_headers = req.headers.copy()
3026 req_headers.update(headers)
3027 req_data = data or req.data
3028 req_url = update_url_query(url or req.get_full_url(), query)
3029 req_get_method = req.get_method()
3030 if req_get_method == 'HEAD':
3031 req_type = HEADRequest
3032 elif req_get_method == 'PUT':
3033 req_type = PUTRequest
3034 else:
3035 req_type = urllib.request.Request
3036 new_req = req_type(
3037 req_url, data=req_data, headers=req_headers,
3038 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3039 if hasattr(req, 'timeout'):
3040 new_req.timeout = req.timeout
3041 return new_req
3042
3043
3044 def _multipart_encode_impl(data, boundary):
3045 content_type = 'multipart/form-data; boundary=%s' % boundary
3046
3047 out = b''
3048 for k, v in data.items():
3049 out += b'--' + boundary.encode('ascii') + b'\r\n'
3050 if isinstance(k, compat_str):
3051 k = k.encode()
3052 if isinstance(v, compat_str):
3053 v = v.encode()
3054 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3055 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3056 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3057 if boundary.encode('ascii') in content:
3058 raise ValueError('Boundary overlaps with data')
3059 out += content
3060
3061 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3062
3063 return out, content_type
3064
3065
3066 def multipart_encode(data, boundary=None):
3067 '''
3068 Encode a dict to RFC 7578-compliant form-data
3069
3070 data:
3071 A dict where keys and values can be either Unicode or bytes-like
3072 objects.
3073 boundary:
3074         If specified, a Unicode object to use as the boundary. Otherwise
3075         a random boundary is generated.
3076
3077 Reference: https://tools.ietf.org/html/rfc7578
3078 '''
3079 has_specified_boundary = boundary is not None
3080
3081 while True:
3082 if boundary is None:
3083 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3084
3085 try:
3086 out, content_type = _multipart_encode_impl(data, boundary)
3087 break
3088 except ValueError:
3089 if has_specified_boundary:
3090 raise
3091 boundary = None
3092
3093 return out, content_type
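
# Example (sketch; 'XXXX' is an arbitrary boundary):
#   >>> out, ctype = multipart_encode({'field': 'value'}, boundary='XXXX')
#   >>> ctype
#   'multipart/form-data; boundary=XXXX'
#   >>> out
#   b'--XXXX\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--XXXX--\r\n'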
3094
3095
3096 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3097 for val in map(d.get, variadic(key_or_keys)):
3098 if val is not None and (val or not skip_false_values):
3099 return val
3100 return default
3101
3102
3103 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3104 for f in funcs:
3105 try:
3106 val = f(*args, **kwargs)
3107 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3108 pass
3109 else:
3110 if expected_type is None or isinstance(val, expected_type):
3111 return val
3112
3113
3114 def try_get(src, getter, expected_type=None):
3115 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3116
3117
3118 def filter_dict(dct, cndn=lambda _, v: v is not None):
3119 return {k: v for k, v in dct.items() if cndn(k, v)}
3120
3121
3122 def merge_dicts(*dicts):
3123 merged = {}
3124 for a_dict in dicts:
3125 for k, v in a_dict.items():
3126 if (v is not None and k not in merged
3127 or isinstance(v, str) and merged[k] == ''):
3128 merged[k] = v
3129 return merged
3130
3131
3132 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3133 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3134
3135
3136 US_RATINGS = {
3137 'G': 0,
3138 'PG': 10,
3139 'PG-13': 13,
3140 'R': 16,
3141 'NC': 18,
3142 }
3143
3144
3145 TV_PARENTAL_GUIDELINES = {
3146 'TV-Y': 0,
3147 'TV-Y7': 7,
3148 'TV-G': 0,
3149 'TV-PG': 0,
3150 'TV-14': 14,
3151 'TV-MA': 17,
3152 }
3153
3154
3155 def parse_age_limit(s):
3156 # isinstance(False, int) is True. So type() must be used instead
3157 if type(s) is int: # noqa: E721
3158 return s if 0 <= s <= 21 else None
3159 elif not isinstance(s, str):
3160 return None
3161 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3162 if m:
3163 return int(m.group('age'))
3164 s = s.upper()
3165 if s in US_RATINGS:
3166 return US_RATINGS[s]
3167 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3168 if m:
3169 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3170 return None
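
# Examples (sketch):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18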
3171
3172
3173 def strip_jsonp(code):
3174 return re.sub(
3175 r'''(?sx)^
3176 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3177 (?:\s*&&\s*(?P=func_name))?
3178 \s*\(\s*(?P<callback_data>.*)\);?
3179 \s*?(?://[^\n]*)*$''',
3180 r'\g<callback_data>', code)
3181
3182
3183 def js_to_json(code, vars={}):
3184 # vars is a dict of var, val pairs to substitute
3185 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3186 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3187 INTEGER_TABLE = (
3188 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3189 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3190 )
3191
3192 def fix_kv(m):
3193 v = m.group(0)
3194 if v in ('true', 'false', 'null'):
3195 return v
3196 elif v in ('undefined', 'void 0'):
3197 return 'null'
3198 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3199 return ""
3200
3201 if v[0] in ("'", '"'):
3202 v = re.sub(r'(?s)\\.|"', lambda m: {
3203 '"': '\\"',
3204 "\\'": "'",
3205 '\\\n': '',
3206 '\\x': '\\u00',
3207 }.get(m.group(0), m.group(0)), v[1:-1])
3208 else:
3209 for regex, base in INTEGER_TABLE:
3210 im = re.match(regex, v)
3211 if im:
3212 i = int(im.group(1), base)
3213 return '"%d":' % i if v.endswith(':') else '%d' % i
3214
3215 if v in vars:
3216 return vars[v]
3217
3218 return '"%s"' % v
3219
3220 def create_map(mobj):
3221 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3222
3223 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3224 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3225
3226 return re.sub(r'''(?sx)
3227 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3228 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3229 {comment}|,(?={skip}[\]}}])|
3230 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3231 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3232 [0-9]+(?={skip}:)|
3233 !+
3234 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
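
# Example (sketch): unquoted keys, hex literals and `undefined` become valid JSON
#   >>> js_to_json("{foo: 'bar', baz: 0x14, qux: undefined}")
#   '{"foo": "bar", "baz": 20, "qux": null}'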
3235
3236
3237 def qualities(quality_ids):
3238 """ Get a numeric quality value out of a list of possible values """
3239 def q(qid):
3240 try:
3241 return quality_ids.index(qid)
3242 except ValueError:
3243 return -1
3244 return q
3245
3246
3247 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3248
3249
3250 DEFAULT_OUTTMPL = {
3251 'default': '%(title)s [%(id)s].%(ext)s',
3252 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3253 }
3254 OUTTMPL_TYPES = {
3255 'chapter': None,
3256 'subtitle': None,
3257 'thumbnail': None,
3258 'description': 'description',
3259 'annotation': 'annotations.xml',
3260 'infojson': 'info.json',
3261 'link': None,
3262 'pl_video': None,
3263 'pl_thumbnail': None,
3264 'pl_description': 'description',
3265 'pl_infojson': 'info.json',
3266 }
3267
3268 # As of [1] format syntax is:
3269 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3270 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3271 STR_FORMAT_RE_TMPL = r'''(?x)
3272 (?<!%)(?P<prefix>(?:%%)*)
3273 %
3274 (?P<has_key>\((?P<key>{0})\))?
3275 (?P<format>
3276 (?P<conversion>[#0\-+ ]+)?
3277 (?P<min_width>\d+)?
3278 (?P<precision>\.\d+)?
3279 (?P<len_mod>[hlL])? # unused in python
3280 {1} # conversion type
3281 )
3282 '''
3283
3284
3285 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3286
3287
3288 def limit_length(s, length):
3289 """ Add ellipses to overly long strings """
3290 if s is None:
3291 return None
3292 ELLIPSES = '...'
3293 if len(s) > length:
3294 return s[:length - len(ELLIPSES)] + ELLIPSES
3295 return s
3296
3297
3298 def version_tuple(v):
3299 return tuple(int(e) for e in re.split(r'[-.]', v))
3300
3301
3302 def is_outdated_version(version, limit, assume_new=True):
3303 if not version:
3304 return not assume_new
3305 try:
3306 return version_tuple(version) < version_tuple(limit)
3307 except ValueError:
3308 return not assume_new
3309
3310
3311 def ytdl_is_updateable():
3312 """ Returns if yt-dlp can be updated with -U """
3313
3314 from .update import is_non_updateable
3315
3316 return not is_non_updateable()
3317
3318
3319 def args_to_str(args):
3320 # Get a short string representation for a subprocess command
3321 return ' '.join(compat_shlex_quote(a) for a in args)
3322
3323
3324 def error_to_compat_str(err):
3325 return str(err)
3326
3327
3328 def error_to_str(err):
3329 return f'{type(err).__name__}: {err}'
3330
3331
3332 def mimetype2ext(mt):
3333 if mt is None:
3334 return None
3335
3336 mt, _, params = mt.partition(';')
3337 mt = mt.strip()
3338
3339 FULL_MAP = {
3340 'audio/mp4': 'm4a',
3341         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3; we use .mp3 here
3342         # since it's the most popular one
3343 'audio/mpeg': 'mp3',
3344 'audio/x-wav': 'wav',
3345 'audio/wav': 'wav',
3346 'audio/wave': 'wav',
3347 }
3348
3349 ext = FULL_MAP.get(mt)
3350 if ext is not None:
3351 return ext
3352
3353 SUBTYPE_MAP = {
3354 '3gpp': '3gp',
3355 'smptett+xml': 'tt',
3356 'ttaf+xml': 'dfxp',
3357 'ttml+xml': 'ttml',
3358 'x-flv': 'flv',
3359 'x-mp4-fragmented': 'mp4',
3360 'x-ms-sami': 'sami',
3361 'x-ms-wmv': 'wmv',
3362 'mpegurl': 'm3u8',
3363 'x-mpegurl': 'm3u8',
3364 'vnd.apple.mpegurl': 'm3u8',
3365 'dash+xml': 'mpd',
3366 'f4m+xml': 'f4m',
3367 'hds+xml': 'f4m',
3368 'vnd.ms-sstr+xml': 'ism',
3369 'quicktime': 'mov',
3370 'mp2t': 'ts',
3371 'x-wav': 'wav',
3372 'filmstrip+json': 'fs',
3373 'svg+xml': 'svg',
3374 }
3375
3376 _, _, subtype = mt.rpartition('/')
3377 ext = SUBTYPE_MAP.get(subtype.lower())
3378 if ext is not None:
3379 return ext
3380
3381 SUFFIX_MAP = {
3382 'json': 'json',
3383 'xml': 'xml',
3384 'zip': 'zip',
3385 'gzip': 'gz',
3386 }
3387
3388 _, _, suffix = subtype.partition('+')
3389 ext = SUFFIX_MAP.get(suffix)
3390 if ext is not None:
3391 return ext
3392
3393 return subtype.replace('+', '.')
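
# Examples (sketch):
#   >>> mimetype2ext('audio/mp4')
#   'm4a'
#   >>> mimetype2ext('application/vnd.apple.mpegurl')
#   'm3u8'
#   >>> mimetype2ext('text/vtt; charset=UTF-8')
#   'vtt'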
3394
3395
3396 def ext2mimetype(ext_or_url):
3397 if not ext_or_url:
3398 return None
3399 if '.' not in ext_or_url:
3400 ext_or_url = f'file.{ext_or_url}'
3401 return mimetypes.guess_type(ext_or_url)[0]
3402
3403
3404 def parse_codecs(codecs_str):
3405 # http://tools.ietf.org/html/rfc6381
3406 if not codecs_str:
3407 return {}
3408 split_codecs = list(filter(None, map(
3409 str.strip, codecs_str.strip().strip(',').split(','))))
3410 vcodec, acodec, scodec, hdr = None, None, None, None
3411 for full_codec in split_codecs:
3412 parts = full_codec.split('.')
3413 codec = parts[0].replace('0', '')
3414 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3415 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3416 if not vcodec:
3417 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3418 if codec in ('dvh1', 'dvhe'):
3419 hdr = 'DV'
3420 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3421 hdr = 'HDR10'
3422 elif full_codec.replace('0', '').startswith('vp9.2'):
3423 hdr = 'HDR10'
3424 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3425 if not acodec:
3426 acodec = full_codec
3427 elif codec in ('stpp', 'wvtt',):
3428 if not scodec:
3429 scodec = full_codec
3430 else:
3431 write_string(f'WARNING: Unknown codec {full_codec}\n')
3432 if vcodec or acodec or scodec:
3433 return {
3434 'vcodec': vcodec or 'none',
3435 'acodec': acodec or 'none',
3436 'dynamic_range': hdr,
3437 **({'scodec': scodec} if scodec is not None else {}),
3438 }
3439 elif len(split_codecs) == 2:
3440 return {
3441 'vcodec': split_codecs[0],
3442 'acodec': split_codecs[1],
3443 }
3444 return {}
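
# Example (sketch):
#   >>> parse_codecs('avc1.77.30, mp4a.40.2')
#   {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}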
3445
3446
3447 def urlhandle_detect_ext(url_handle):
3448 getheader = url_handle.headers.get
3449
3450 cd = getheader('Content-Disposition')
3451 if cd:
3452 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3453 if m:
3454 e = determine_ext(m.group('filename'), default_ext=None)
3455 if e:
3456 return e
3457
3458 return mimetype2ext(getheader('Content-Type'))
3459
3460
3461 def encode_data_uri(data, mime_type):
3462 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3463
3464
3465 def age_restricted(content_limit, age_limit):
3466 """ Returns True iff the content should be blocked """
3467
3468 if age_limit is None: # No limit set
3469 return False
3470 if content_limit is None:
3471 return False # Content available for everyone
3472 return age_limit < content_limit
3473
3474
3475 def is_html(first_bytes):
3476 """ Detect whether a file contains HTML by examining its first bytes. """
3477
3478 BOMS = [
3479 (b'\xef\xbb\xbf', 'utf-8'),
3480 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3481 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3482 (b'\xff\xfe', 'utf-16-le'),
3483 (b'\xfe\xff', 'utf-16-be'),
3484 ]
3485
3486 encoding = 'utf-8'
3487 for bom, enc in BOMS:
3488 while first_bytes.startswith(bom):
3489 encoding, first_bytes = enc, first_bytes[len(bom):]
3490
3491 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3492
3493
3494 def determine_protocol(info_dict):
3495 protocol = info_dict.get('protocol')
3496 if protocol is not None:
3497 return protocol
3498
3499 url = sanitize_url(info_dict['url'])
3500 if url.startswith('rtmp'):
3501 return 'rtmp'
3502 elif url.startswith('mms'):
3503 return 'mms'
3504 elif url.startswith('rtsp'):
3505 return 'rtsp'
3506
3507 ext = determine_ext(url)
3508 if ext == 'm3u8':
3509 return 'm3u8'
3510 elif ext == 'f4m':
3511 return 'f4m'
3512
3513 return compat_urllib_parse_urlparse(url).scheme
3514
3515
3516 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3517 """ Render a list of rows, each as a list of values.
3518 Text after a \t will be right aligned """
3519 def width(string):
3520 return len(remove_terminal_sequences(string).replace('\t', ''))
3521
3522 def get_max_lens(table):
3523 return [max(width(str(v)) for v in col) for col in zip(*table)]
3524
3525 def filter_using_list(row, filterArray):
3526 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3527
3528 max_lens = get_max_lens(data) if hide_empty else []
3529 header_row = filter_using_list(header_row, max_lens)
3530 data = [filter_using_list(row, max_lens) for row in data]
3531
3532 table = [header_row] + data
3533 max_lens = get_max_lens(table)
3534 extra_gap += 1
3535 if delim:
3536 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3537 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3538 for row in table:
3539 for pos, text in enumerate(map(str, row)):
3540 if '\t' in text:
3541 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3542 else:
3543 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3544 ret = '\n'.join(''.join(row).rstrip() for row in table)
3545 return ret
3546
3547
3548 def _match_one(filter_part, dct, incomplete):
3549 # TODO: Generalize code with YoutubeDL._build_format_filter
3550 STRING_OPERATORS = {
3551 '*=': operator.contains,
3552 '^=': lambda attr, value: attr.startswith(value),
3553 '$=': lambda attr, value: attr.endswith(value),
3554 '~=': lambda attr, value: re.search(value, attr),
3555 }
3556 COMPARISON_OPERATORS = {
3557 **STRING_OPERATORS,
3558 '<=': operator.le, # "<=" must be defined above "<"
3559 '<': operator.lt,
3560 '>=': operator.ge,
3561 '>': operator.gt,
3562 '=': operator.eq,
3563 }
3564
3565 if isinstance(incomplete, bool):
3566 is_incomplete = lambda _: incomplete
3567 else:
3568 is_incomplete = lambda k: k in incomplete
3569
3570 operator_rex = re.compile(r'''(?x)
3571 (?P<key>[a-z_]+)
3572 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3573 (?:
3574 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3575 (?P<strval>.+?)
3576 )
3577 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3578 m = operator_rex.fullmatch(filter_part.strip())
3579 if m:
3580 m = m.groupdict()
3581 unnegated_op = COMPARISON_OPERATORS[m['op']]
3582 if m['negation']:
3583 op = lambda attr, value: not unnegated_op(attr, value)
3584 else:
3585 op = unnegated_op
3586         comparison_value = m['quotedstrval'] or m['strval']
3587 if m['quote']:
3588 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3589 actual_value = dct.get(m['key'])
3590 numeric_comparison = None
3591 if isinstance(actual_value, (int, float)):
3592 # The field value is numeric, so try to interpret the comparison value
3593 # numerically as well. Conversely, if the original field is a string, the
3594 # comparison value is processed as a string even when it looks like a
3595 # number (see https://github.com/ytdl-org/youtube-dl/issues/11082)
3596 try:
3597 numeric_comparison = int(comparison_value)
3598 except ValueError:
3599 numeric_comparison = parse_filesize(comparison_value)
3600 if numeric_comparison is None:
3601 numeric_comparison = parse_filesize(f'{comparison_value}B')
3602 if numeric_comparison is None:
3603 numeric_comparison = parse_duration(comparison_value)
3604 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3605 raise ValueError('Operator %s only supports string values!' % m['op'])
3606 if actual_value is None:
3607 return is_incomplete(m['key']) or m['none_inclusive']
3608 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3609
3610 UNARY_OPERATORS = {
3611 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3612 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3613 }
3614 operator_rex = re.compile(r'''(?x)
3615 (?P<op>%s)\s*(?P<key>[a-z_]+)
3616 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3617 m = operator_rex.fullmatch(filter_part.strip())
3618 if m:
3619 op = UNARY_OPERATORS[m.group('op')]
3620 actual_value = dct.get(m.group('key'))
3621 if is_incomplete(m.group('key')) and actual_value is None:
3622 return True
3623 return op(actual_value)
3624
3625 raise ValueError('Invalid filter part %r' % filter_part)
3626
3627
3628 def match_str(filter_str, dct, incomplete=False):
3629 """ Filter a dictionary with a simple string syntax.
3630 @returns Whether the filter passes
3631 @param incomplete Set of keys that are expected to be missing from dct.
3632 Can be True/False to indicate all/none of the keys may be missing.
3633 All conditions on incomplete keys pass if the key is missing
3634 """
3635 return all(
3636 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3637 for filter_part in re.split(r'(?<!\\)&', filter_str))
3638
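# A sketch of the filter syntax documented above, on a hypothetical dict:
#
#   >>> match_str('like_count > 100 & dislike_count <? 50 & description',
#   ...           {'like_count': 190, 'dislike_count': 10, 'description': 'foo'})
#   True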
3639
3640 def match_filter_func(filters):
3641 if not filters:
3642 return None
3643 filters = set(variadic(filters))
3644
3645 interactive = '-' in filters
3646 if interactive:
3647 filters.remove('-')
3648
3649 def _match_func(info_dict, incomplete=False):
3650 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3651 return NO_DEFAULT if interactive and not incomplete else None
3652 else:
3653 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3654 filter_str = ') | ('.join(map(str.strip, filters))
3655 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3656 return _match_func
3657
3658
3659 def download_range_func(chapters, ranges):
3660 def inner(info_dict, ydl):
3661 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3662 else 'Cannot match chapters since chapter information is unavailable')
3663 for regex in chapters or []:
3664 for i, chapter in enumerate(info_dict.get('chapters') or []):
3665 if re.search(regex, chapter['title']):
3666 warning = None
3667 yield {**chapter, 'index': i}
3668 if chapters and warning:
3669 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3670
3671 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3672
3673 return inner
3674
3675
3676 def parse_dfxp_time_expr(time_expr):
3677 if not time_expr:
3678 return
3679
3680 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3681 if mobj:
3682 return float(mobj.group('time_offset'))
3683
3684 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3685 if mobj:
3686 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3687
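# e.g. (sketch): parse_dfxp_time_expr('5.1s') == 5.1 and
# parse_dfxp_time_expr('00:01:02.5') == 62.5; unparsable input yields None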
3688
3689 def srt_subtitles_timecode(seconds):
3690 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3691
3692
3693 def ass_subtitles_timecode(seconds):
3694 time = timetuple_from_msec(seconds * 1000)
3695 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3696
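# e.g. (sketch): srt_subtitles_timecode(3661.5) == '01:01:01,500'
# and ass_subtitles_timecode(3661.5) == '1:01:01.50'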
3697
3698 def dfxp2srt(dfxp_data):
3699 '''
3700 @param dfxp_data A bytes-like object containing DFXP data
3701 @returns A unicode object containing converted SRT data
3702 '''
3703 LEGACY_NAMESPACES = (
3704 (b'http://www.w3.org/ns/ttml', [
3705 b'http://www.w3.org/2004/11/ttaf1',
3706 b'http://www.w3.org/2006/04/ttaf1',
3707 b'http://www.w3.org/2006/10/ttaf1',
3708 ]),
3709 (b'http://www.w3.org/ns/ttml#styling', [
3710 b'http://www.w3.org/ns/ttml#style',
3711 ]),
3712 )
3713
3714 SUPPORTED_STYLING = [
3715 'color',
3716 'fontFamily',
3717 'fontSize',
3718 'fontStyle',
3719 'fontWeight',
3720 'textDecoration'
3721 ]
3722
3723 _x = functools.partial(xpath_with_ns, ns_map={
3724 'xml': 'http://www.w3.org/XML/1998/namespace',
3725 'ttml': 'http://www.w3.org/ns/ttml',
3726 'tts': 'http://www.w3.org/ns/ttml#styling',
3727 })
3728
3729 styles = {}
3730 default_style = {}
3731
3732 class TTMLPElementParser:
3733 _out = ''
3734 _unclosed_elements = []
3735 _applied_styles = []
3736
3737 def start(self, tag, attrib):
3738 if tag in (_x('ttml:br'), 'br'):
3739 self._out += '\n'
3740 else:
3741 unclosed_elements = []
3742 style = {}
3743 element_style_id = attrib.get('style')
3744 if default_style:
3745 style.update(default_style)
3746 if element_style_id:
3747 style.update(styles.get(element_style_id, {}))
3748 for prop in SUPPORTED_STYLING:
3749 prop_val = attrib.get(_x('tts:' + prop))
3750 if prop_val:
3751 style[prop] = prop_val
3752 if style:
3753 font = ''
3754 for k, v in sorted(style.items()):
3755 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3756 continue
3757 if k == 'color':
3758 font += ' color="%s"' % v
3759 elif k == 'fontSize':
3760 font += ' size="%s"' % v
3761 elif k == 'fontFamily':
3762 font += ' face="%s"' % v
3763 elif k == 'fontWeight' and v == 'bold':
3764 self._out += '<b>'
3765 unclosed_elements.append('b')
3766 elif k == 'fontStyle' and v == 'italic':
3767 self._out += '<i>'
3768 unclosed_elements.append('i')
3769 elif k == 'textDecoration' and v == 'underline':
3770 self._out += '<u>'
3771 unclosed_elements.append('u')
3772 if font:
3773 self._out += '<font' + font + '>'
3774 unclosed_elements.append('font')
3775 applied_style = {}
3776 if self._applied_styles:
3777 applied_style.update(self._applied_styles[-1])
3778 applied_style.update(style)
3779 self._applied_styles.append(applied_style)
3780 self._unclosed_elements.append(unclosed_elements)
3781
3782 def end(self, tag):
3783 if tag not in (_x('ttml:br'), 'br'):
3784 unclosed_elements = self._unclosed_elements.pop()
3785 for element in reversed(unclosed_elements):
3786 self._out += '</%s>' % element
3787 if unclosed_elements and self._applied_styles:
3788 self._applied_styles.pop()
3789
3790 def data(self, data):
3791 self._out += data
3792
3793 def close(self):
3794 return self._out.strip()
3795
3796 def parse_node(node):
3797 target = TTMLPElementParser()
3798 parser = xml.etree.ElementTree.XMLParser(target=target)
3799 parser.feed(xml.etree.ElementTree.tostring(node))
3800 return parser.close()
3801
3802 for k, v in LEGACY_NAMESPACES:
3803 for ns in v:
3804 dfxp_data = dfxp_data.replace(ns, k)
3805
3806 dfxp = compat_etree_fromstring(dfxp_data)
3807 out = []
3808 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3809
3810 if not paras:
3811 raise ValueError('Invalid dfxp/TTML subtitle')
3812
3813 repeat = False
3814 while True:
3815 for style in dfxp.findall(_x('.//ttml:style')):
3816 style_id = style.get('id') or style.get(_x('xml:id'))
3817 if not style_id:
3818 continue
3819 parent_style_id = style.get('style')
3820 if parent_style_id:
3821 if parent_style_id not in styles:
3822 repeat = True
3823 continue
3824 styles[style_id] = styles[parent_style_id].copy()
3825 for prop in SUPPORTED_STYLING:
3826 prop_val = style.get(_x('tts:' + prop))
3827 if prop_val:
3828 styles.setdefault(style_id, {})[prop] = prop_val
3829 if repeat:
3830 repeat = False
3831 else:
3832 break
3833
3834 for p in ('body', 'div'):
3835 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3836 if ele is None:
3837 continue
3838 style = styles.get(ele.get('style'))
3839 if not style:
3840 continue
3841 default_style.update(style)
3842
3843 for para, index in zip(paras, itertools.count(1)):
3844 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3845 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3846 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3847 if begin_time is None:
3848 continue
3849 if not end_time:
3850 if not dur:
3851 continue
3852 end_time = begin_time + dur
3853 out.append('%d\n%s --> %s\n%s\n\n' % (
3854 index,
3855 srt_subtitles_timecode(begin_time),
3856 srt_subtitles_timecode(end_time),
3857 parse_node(para)))
3858
3859 return ''.join(out)
3860
3861
3862 def cli_option(params, command_option, param, separator=None):
3863 param = params.get(param)
3864 return ([] if param is None
3865 else [command_option, str(param)] if separator is None
3866 else [f'{command_option}{separator}{param}'])
3867
3868
3869 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3870 param = params.get(param)
3871 assert param in (True, False, None)
3872 return cli_option({True: true_value, False: false_value}, command_option, param, separator)  # the bool selects true_value/false_value via cli_option's params.get
3873
3874
3875 def cli_valueless_option(params, command_option, param, expected_value=True):
3876 return [command_option] if params.get(param) == expected_value else []
3877
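# Usage sketches for the cli_* helpers, with hypothetical params dicts:
#
#   cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#       == ['--proxy', '127.0.0.1:3128']
#   cli_bool_option({'nocheckcertificate': True}, '--check-certificate',
#                   'nocheckcertificate', 'false', 'true')
#       == ['--check-certificate', 'false']
#   cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#       == ['--quiet']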
3878
3879 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3880 if isinstance(argdict, (list, tuple)): # for backward compatibility
3881 if use_compat:
3882 return argdict
3883 else:
3884 argdict = None
3885 if argdict is None:
3886 return default
3887 assert isinstance(argdict, dict)
3888
3889 assert isinstance(keys, (list, tuple))
3890 for key_list in keys:
3891 arg_list = list(filter(
3892 lambda x: x is not None,
3893 [argdict.get(key.lower()) for key in variadic(key_list)]))
3894 if arg_list:
3895 return [arg for args in arg_list for arg in args]
3896 return default
3897
3898
3899 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3900 main_key, exe = main_key.lower(), exe.lower()
3901 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3902 keys = [f'{root_key}{k}' for k in (keys or [''])]
3903 if root_key in keys:
3904 if main_key != exe:
3905 keys.append((main_key, exe))
3906 keys.append('default')
3907 else:
3908 use_compat = False
3909 return cli_configuration_args(argdict, keys, default, use_compat)
3910
3911
3912 class ISO639Utils:
3913 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3914 _lang_map = {
3915 'aa': 'aar',
3916 'ab': 'abk',
3917 'ae': 'ave',
3918 'af': 'afr',
3919 'ak': 'aka',
3920 'am': 'amh',
3921 'an': 'arg',
3922 'ar': 'ara',
3923 'as': 'asm',
3924 'av': 'ava',
3925 'ay': 'aym',
3926 'az': 'aze',
3927 'ba': 'bak',
3928 'be': 'bel',
3929 'bg': 'bul',
3930 'bh': 'bih',
3931 'bi': 'bis',
3932 'bm': 'bam',
3933 'bn': 'ben',
3934 'bo': 'bod',
3935 'br': 'bre',
3936 'bs': 'bos',
3937 'ca': 'cat',
3938 'ce': 'che',
3939 'ch': 'cha',
3940 'co': 'cos',
3941 'cr': 'cre',
3942 'cs': 'ces',
3943 'cu': 'chu',
3944 'cv': 'chv',
3945 'cy': 'cym',
3946 'da': 'dan',
3947 'de': 'deu',
3948 'dv': 'div',
3949 'dz': 'dzo',
3950 'ee': 'ewe',
3951 'el': 'ell',
3952 'en': 'eng',
3953 'eo': 'epo',
3954 'es': 'spa',
3955 'et': 'est',
3956 'eu': 'eus',
3957 'fa': 'fas',
3958 'ff': 'ful',
3959 'fi': 'fin',
3960 'fj': 'fij',
3961 'fo': 'fao',
3962 'fr': 'fra',
3963 'fy': 'fry',
3964 'ga': 'gle',
3965 'gd': 'gla',
3966 'gl': 'glg',
3967 'gn': 'grn',
3968 'gu': 'guj',
3969 'gv': 'glv',
3970 'ha': 'hau',
3971 'he': 'heb',
3972 'iw': 'heb', # Replaced by he in 1989 revision
3973 'hi': 'hin',
3974 'ho': 'hmo',
3975 'hr': 'hrv',
3976 'ht': 'hat',
3977 'hu': 'hun',
3978 'hy': 'hye',
3979 'hz': 'her',
3980 'ia': 'ina',
3981 'id': 'ind',
3982 'in': 'ind', # Replaced by id in 1989 revision
3983 'ie': 'ile',
3984 'ig': 'ibo',
3985 'ii': 'iii',
3986 'ik': 'ipk',
3987 'io': 'ido',
3988 'is': 'isl',
3989 'it': 'ita',
3990 'iu': 'iku',
3991 'ja': 'jpn',
3992 'jv': 'jav',
3993 'ka': 'kat',
3994 'kg': 'kon',
3995 'ki': 'kik',
3996 'kj': 'kua',
3997 'kk': 'kaz',
3998 'kl': 'kal',
3999 'km': 'khm',
4000 'kn': 'kan',
4001 'ko': 'kor',
4002 'kr': 'kau',
4003 'ks': 'kas',
4004 'ku': 'kur',
4005 'kv': 'kom',
4006 'kw': 'cor',
4007 'ky': 'kir',
4008 'la': 'lat',
4009 'lb': 'ltz',
4010 'lg': 'lug',
4011 'li': 'lim',
4012 'ln': 'lin',
4013 'lo': 'lao',
4014 'lt': 'lit',
4015 'lu': 'lub',
4016 'lv': 'lav',
4017 'mg': 'mlg',
4018 'mh': 'mah',
4019 'mi': 'mri',
4020 'mk': 'mkd',
4021 'ml': 'mal',
4022 'mn': 'mon',
4023 'mr': 'mar',
4024 'ms': 'msa',
4025 'mt': 'mlt',
4026 'my': 'mya',
4027 'na': 'nau',
4028 'nb': 'nob',
4029 'nd': 'nde',
4030 'ne': 'nep',
4031 'ng': 'ndo',
4032 'nl': 'nld',
4033 'nn': 'nno',
4034 'no': 'nor',
4035 'nr': 'nbl',
4036 'nv': 'nav',
4037 'ny': 'nya',
4038 'oc': 'oci',
4039 'oj': 'oji',
4040 'om': 'orm',
4041 'or': 'ori',
4042 'os': 'oss',
4043 'pa': 'pan',
4044 'pi': 'pli',
4045 'pl': 'pol',
4046 'ps': 'pus',
4047 'pt': 'por',
4048 'qu': 'que',
4049 'rm': 'roh',
4050 'rn': 'run',
4051 'ro': 'ron',
4052 'ru': 'rus',
4053 'rw': 'kin',
4054 'sa': 'san',
4055 'sc': 'srd',
4056 'sd': 'snd',
4057 'se': 'sme',
4058 'sg': 'sag',
4059 'si': 'sin',
4060 'sk': 'slk',
4061 'sl': 'slv',
4062 'sm': 'smo',
4063 'sn': 'sna',
4064 'so': 'som',
4065 'sq': 'sqi',
4066 'sr': 'srp',
4067 'ss': 'ssw',
4068 'st': 'sot',
4069 'su': 'sun',
4070 'sv': 'swe',
4071 'sw': 'swa',
4072 'ta': 'tam',
4073 'te': 'tel',
4074 'tg': 'tgk',
4075 'th': 'tha',
4076 'ti': 'tir',
4077 'tk': 'tuk',
4078 'tl': 'tgl',
4079 'tn': 'tsn',
4080 'to': 'ton',
4081 'tr': 'tur',
4082 'ts': 'tso',
4083 'tt': 'tat',
4084 'tw': 'twi',
4085 'ty': 'tah',
4086 'ug': 'uig',
4087 'uk': 'ukr',
4088 'ur': 'urd',
4089 'uz': 'uzb',
4090 've': 'ven',
4091 'vi': 'vie',
4092 'vo': 'vol',
4093 'wa': 'wln',
4094 'wo': 'wol',
4095 'xh': 'xho',
4096 'yi': 'yid',
4097 'ji': 'yid', # Replaced by yi in 1989 revision
4098 'yo': 'yor',
4099 'za': 'zha',
4100 'zh': 'zho',
4101 'zu': 'zul',
4102 }
4103
4104 @classmethod
4105 def short2long(cls, code):
4106 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4107 return cls._lang_map.get(code[:2])
4108
4109 @classmethod
4110 def long2short(cls, code):
4111 """Convert language code from ISO 639-2/T to ISO 639-1"""
4112 for short_name, long_name in cls._lang_map.items():
4113 if long_name == code:
4114 return short_name
4115
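# e.g. (sketch): ISO639Utils.short2long('en') == 'eng'
# and ISO639Utils.long2short('deu') == 'de'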
4116
4117 class ISO3166Utils:
4118 # From http://data.okfn.org/data/core/country-list
4119 _country_map = {
4120 'AF': 'Afghanistan',
4121 'AX': 'Åland Islands',
4122 'AL': 'Albania',
4123 'DZ': 'Algeria',
4124 'AS': 'American Samoa',
4125 'AD': 'Andorra',
4126 'AO': 'Angola',
4127 'AI': 'Anguilla',
4128 'AQ': 'Antarctica',
4129 'AG': 'Antigua and Barbuda',
4130 'AR': 'Argentina',
4131 'AM': 'Armenia',
4132 'AW': 'Aruba',
4133 'AU': 'Australia',
4134 'AT': 'Austria',
4135 'AZ': 'Azerbaijan',
4136 'BS': 'Bahamas',
4137 'BH': 'Bahrain',
4138 'BD': 'Bangladesh',
4139 'BB': 'Barbados',
4140 'BY': 'Belarus',
4141 'BE': 'Belgium',
4142 'BZ': 'Belize',
4143 'BJ': 'Benin',
4144 'BM': 'Bermuda',
4145 'BT': 'Bhutan',
4146 'BO': 'Bolivia, Plurinational State of',
4147 'BQ': 'Bonaire, Sint Eustatius and Saba',
4148 'BA': 'Bosnia and Herzegovina',
4149 'BW': 'Botswana',
4150 'BV': 'Bouvet Island',
4151 'BR': 'Brazil',
4152 'IO': 'British Indian Ocean Territory',
4153 'BN': 'Brunei Darussalam',
4154 'BG': 'Bulgaria',
4155 'BF': 'Burkina Faso',
4156 'BI': 'Burundi',
4157 'KH': 'Cambodia',
4158 'CM': 'Cameroon',
4159 'CA': 'Canada',
4160 'CV': 'Cape Verde',
4161 'KY': 'Cayman Islands',
4162 'CF': 'Central African Republic',
4163 'TD': 'Chad',
4164 'CL': 'Chile',
4165 'CN': 'China',
4166 'CX': 'Christmas Island',
4167 'CC': 'Cocos (Keeling) Islands',
4168 'CO': 'Colombia',
4169 'KM': 'Comoros',
4170 'CG': 'Congo',
4171 'CD': 'Congo, the Democratic Republic of the',
4172 'CK': 'Cook Islands',
4173 'CR': 'Costa Rica',
4174 'CI': 'Côte d\'Ivoire',
4175 'HR': 'Croatia',
4176 'CU': 'Cuba',
4177 'CW': 'Curaçao',
4178 'CY': 'Cyprus',
4179 'CZ': 'Czech Republic',
4180 'DK': 'Denmark',
4181 'DJ': 'Djibouti',
4182 'DM': 'Dominica',
4183 'DO': 'Dominican Republic',
4184 'EC': 'Ecuador',
4185 'EG': 'Egypt',
4186 'SV': 'El Salvador',
4187 'GQ': 'Equatorial Guinea',
4188 'ER': 'Eritrea',
4189 'EE': 'Estonia',
4190 'ET': 'Ethiopia',
4191 'FK': 'Falkland Islands (Malvinas)',
4192 'FO': 'Faroe Islands',
4193 'FJ': 'Fiji',
4194 'FI': 'Finland',
4195 'FR': 'France',
4196 'GF': 'French Guiana',
4197 'PF': 'French Polynesia',
4198 'TF': 'French Southern Territories',
4199 'GA': 'Gabon',
4200 'GM': 'Gambia',
4201 'GE': 'Georgia',
4202 'DE': 'Germany',
4203 'GH': 'Ghana',
4204 'GI': 'Gibraltar',
4205 'GR': 'Greece',
4206 'GL': 'Greenland',
4207 'GD': 'Grenada',
4208 'GP': 'Guadeloupe',
4209 'GU': 'Guam',
4210 'GT': 'Guatemala',
4211 'GG': 'Guernsey',
4212 'GN': 'Guinea',
4213 'GW': 'Guinea-Bissau',
4214 'GY': 'Guyana',
4215 'HT': 'Haiti',
4216 'HM': 'Heard Island and McDonald Islands',
4217 'VA': 'Holy See (Vatican City State)',
4218 'HN': 'Honduras',
4219 'HK': 'Hong Kong',
4220 'HU': 'Hungary',
4221 'IS': 'Iceland',
4222 'IN': 'India',
4223 'ID': 'Indonesia',
4224 'IR': 'Iran, Islamic Republic of',
4225 'IQ': 'Iraq',
4226 'IE': 'Ireland',
4227 'IM': 'Isle of Man',
4228 'IL': 'Israel',
4229 'IT': 'Italy',
4230 'JM': 'Jamaica',
4231 'JP': 'Japan',
4232 'JE': 'Jersey',
4233 'JO': 'Jordan',
4234 'KZ': 'Kazakhstan',
4235 'KE': 'Kenya',
4236 'KI': 'Kiribati',
4237 'KP': 'Korea, Democratic People\'s Republic of',
4238 'KR': 'Korea, Republic of',
4239 'KW': 'Kuwait',
4240 'KG': 'Kyrgyzstan',
4241 'LA': 'Lao People\'s Democratic Republic',
4242 'LV': 'Latvia',
4243 'LB': 'Lebanon',
4244 'LS': 'Lesotho',
4245 'LR': 'Liberia',
4246 'LY': 'Libya',
4247 'LI': 'Liechtenstein',
4248 'LT': 'Lithuania',
4249 'LU': 'Luxembourg',
4250 'MO': 'Macao',
4251 'MK': 'Macedonia, the Former Yugoslav Republic of',
4252 'MG': 'Madagascar',
4253 'MW': 'Malawi',
4254 'MY': 'Malaysia',
4255 'MV': 'Maldives',
4256 'ML': 'Mali',
4257 'MT': 'Malta',
4258 'MH': 'Marshall Islands',
4259 'MQ': 'Martinique',
4260 'MR': 'Mauritania',
4261 'MU': 'Mauritius',
4262 'YT': 'Mayotte',
4263 'MX': 'Mexico',
4264 'FM': 'Micronesia, Federated States of',
4265 'MD': 'Moldova, Republic of',
4266 'MC': 'Monaco',
4267 'MN': 'Mongolia',
4268 'ME': 'Montenegro',
4269 'MS': 'Montserrat',
4270 'MA': 'Morocco',
4271 'MZ': 'Mozambique',
4272 'MM': 'Myanmar',
4273 'NA': 'Namibia',
4274 'NR': 'Nauru',
4275 'NP': 'Nepal',
4276 'NL': 'Netherlands',
4277 'NC': 'New Caledonia',
4278 'NZ': 'New Zealand',
4279 'NI': 'Nicaragua',
4280 'NE': 'Niger',
4281 'NG': 'Nigeria',
4282 'NU': 'Niue',
4283 'NF': 'Norfolk Island',
4284 'MP': 'Northern Mariana Islands',
4285 'NO': 'Norway',
4286 'OM': 'Oman',
4287 'PK': 'Pakistan',
4288 'PW': 'Palau',
4289 'PS': 'Palestine, State of',
4290 'PA': 'Panama',
4291 'PG': 'Papua New Guinea',
4292 'PY': 'Paraguay',
4293 'PE': 'Peru',
4294 'PH': 'Philippines',
4295 'PN': 'Pitcairn',
4296 'PL': 'Poland',
4297 'PT': 'Portugal',
4298 'PR': 'Puerto Rico',
4299 'QA': 'Qatar',
4300 'RE': 'Réunion',
4301 'RO': 'Romania',
4302 'RU': 'Russian Federation',
4303 'RW': 'Rwanda',
4304 'BL': 'Saint Barthélemy',
4305 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4306 'KN': 'Saint Kitts and Nevis',
4307 'LC': 'Saint Lucia',
4308 'MF': 'Saint Martin (French part)',
4309 'PM': 'Saint Pierre and Miquelon',
4310 'VC': 'Saint Vincent and the Grenadines',
4311 'WS': 'Samoa',
4312 'SM': 'San Marino',
4313 'ST': 'Sao Tome and Principe',
4314 'SA': 'Saudi Arabia',
4315 'SN': 'Senegal',
4316 'RS': 'Serbia',
4317 'SC': 'Seychelles',
4318 'SL': 'Sierra Leone',
4319 'SG': 'Singapore',
4320 'SX': 'Sint Maarten (Dutch part)',
4321 'SK': 'Slovakia',
4322 'SI': 'Slovenia',
4323 'SB': 'Solomon Islands',
4324 'SO': 'Somalia',
4325 'ZA': 'South Africa',
4326 'GS': 'South Georgia and the South Sandwich Islands',
4327 'SS': 'South Sudan',
4328 'ES': 'Spain',
4329 'LK': 'Sri Lanka',
4330 'SD': 'Sudan',
4331 'SR': 'Suriname',
4332 'SJ': 'Svalbard and Jan Mayen',
4333 'SZ': 'Swaziland',
4334 'SE': 'Sweden',
4335 'CH': 'Switzerland',
4336 'SY': 'Syrian Arab Republic',
4337 'TW': 'Taiwan, Province of China',
4338 'TJ': 'Tajikistan',
4339 'TZ': 'Tanzania, United Republic of',
4340 'TH': 'Thailand',
4341 'TL': 'Timor-Leste',
4342 'TG': 'Togo',
4343 'TK': 'Tokelau',
4344 'TO': 'Tonga',
4345 'TT': 'Trinidad and Tobago',
4346 'TN': 'Tunisia',
4347 'TR': 'Turkey',
4348 'TM': 'Turkmenistan',
4349 'TC': 'Turks and Caicos Islands',
4350 'TV': 'Tuvalu',
4351 'UG': 'Uganda',
4352 'UA': 'Ukraine',
4353 'AE': 'United Arab Emirates',
4354 'GB': 'United Kingdom',
4355 'US': 'United States',
4356 'UM': 'United States Minor Outlying Islands',
4357 'UY': 'Uruguay',
4358 'UZ': 'Uzbekistan',
4359 'VU': 'Vanuatu',
4360 'VE': 'Venezuela, Bolivarian Republic of',
4361 'VN': 'Viet Nam',
4362 'VG': 'Virgin Islands, British',
4363 'VI': 'Virgin Islands, U.S.',
4364 'WF': 'Wallis and Futuna',
4365 'EH': 'Western Sahara',
4366 'YE': 'Yemen',
4367 'ZM': 'Zambia',
4368 'ZW': 'Zimbabwe',
4369 # Not ISO 3166 codes, but used for IP blocks
4370 'AP': 'Asia/Pacific Region',
4371 'EU': 'Europe',
4372 }
4373
4374 @classmethod
4375 def short2full(cls, code):
4376 """Convert an ISO 3166-2 country code to the corresponding full name"""
4377 return cls._country_map.get(code.upper())
4378
4379
4380 class GeoUtils:
4381 # Major IPv4 address blocks per country
4382 _country_ip_map = {
4383 'AD': '46.172.224.0/19',
4384 'AE': '94.200.0.0/13',
4385 'AF': '149.54.0.0/17',
4386 'AG': '209.59.64.0/18',
4387 'AI': '204.14.248.0/21',
4388 'AL': '46.99.0.0/16',
4389 'AM': '46.70.0.0/15',
4390 'AO': '105.168.0.0/13',
4391 'AP': '182.50.184.0/21',
4392 'AQ': '23.154.160.0/24',
4393 'AR': '181.0.0.0/12',
4394 'AS': '202.70.112.0/20',
4395 'AT': '77.116.0.0/14',
4396 'AU': '1.128.0.0/11',
4397 'AW': '181.41.0.0/18',
4398 'AX': '185.217.4.0/22',
4399 'AZ': '5.197.0.0/16',
4400 'BA': '31.176.128.0/17',
4401 'BB': '65.48.128.0/17',
4402 'BD': '114.130.0.0/16',
4403 'BE': '57.0.0.0/8',
4404 'BF': '102.178.0.0/15',
4405 'BG': '95.42.0.0/15',
4406 'BH': '37.131.0.0/17',
4407 'BI': '154.117.192.0/18',
4408 'BJ': '137.255.0.0/16',
4409 'BL': '185.212.72.0/23',
4410 'BM': '196.12.64.0/18',
4411 'BN': '156.31.0.0/16',
4412 'BO': '161.56.0.0/16',
4413 'BQ': '161.0.80.0/20',
4414 'BR': '191.128.0.0/12',
4415 'BS': '24.51.64.0/18',
4416 'BT': '119.2.96.0/19',
4417 'BW': '168.167.0.0/16',
4418 'BY': '178.120.0.0/13',
4419 'BZ': '179.42.192.0/18',
4420 'CA': '99.224.0.0/11',
4421 'CD': '41.243.0.0/16',
4422 'CF': '197.242.176.0/21',
4423 'CG': '160.113.0.0/16',
4424 'CH': '85.0.0.0/13',
4425 'CI': '102.136.0.0/14',
4426 'CK': '202.65.32.0/19',
4427 'CL': '152.172.0.0/14',
4428 'CM': '102.244.0.0/14',
4429 'CN': '36.128.0.0/10',
4430 'CO': '181.240.0.0/12',
4431 'CR': '201.192.0.0/12',
4432 'CU': '152.206.0.0/15',
4433 'CV': '165.90.96.0/19',
4434 'CW': '190.88.128.0/17',
4435 'CY': '31.153.0.0/16',
4436 'CZ': '88.100.0.0/14',
4437 'DE': '53.0.0.0/8',
4438 'DJ': '197.241.0.0/17',
4439 'DK': '87.48.0.0/12',
4440 'DM': '192.243.48.0/20',
4441 'DO': '152.166.0.0/15',
4442 'DZ': '41.96.0.0/12',
4443 'EC': '186.68.0.0/15',
4444 'EE': '90.190.0.0/15',
4445 'EG': '156.160.0.0/11',
4446 'ER': '196.200.96.0/20',
4447 'ES': '88.0.0.0/11',
4448 'ET': '196.188.0.0/14',
4449 'EU': '2.16.0.0/13',
4450 'FI': '91.152.0.0/13',
4451 'FJ': '144.120.0.0/16',
4452 'FK': '80.73.208.0/21',
4453 'FM': '119.252.112.0/20',
4454 'FO': '88.85.32.0/19',
4455 'FR': '90.0.0.0/9',
4456 'GA': '41.158.0.0/15',
4457 'GB': '25.0.0.0/8',
4458 'GD': '74.122.88.0/21',
4459 'GE': '31.146.0.0/16',
4460 'GF': '161.22.64.0/18',
4461 'GG': '62.68.160.0/19',
4462 'GH': '154.160.0.0/12',
4463 'GI': '95.164.0.0/16',
4464 'GL': '88.83.0.0/19',
4465 'GM': '160.182.0.0/15',
4466 'GN': '197.149.192.0/18',
4467 'GP': '104.250.0.0/19',
4468 'GQ': '105.235.224.0/20',
4469 'GR': '94.64.0.0/13',
4470 'GT': '168.234.0.0/16',
4471 'GU': '168.123.0.0/16',
4472 'GW': '197.214.80.0/20',
4473 'GY': '181.41.64.0/18',
4474 'HK': '113.252.0.0/14',
4475 'HN': '181.210.0.0/16',
4476 'HR': '93.136.0.0/13',
4477 'HT': '148.102.128.0/17',
4478 'HU': '84.0.0.0/14',
4479 'ID': '39.192.0.0/10',
4480 'IE': '87.32.0.0/12',
4481 'IL': '79.176.0.0/13',
4482 'IM': '5.62.80.0/20',
4483 'IN': '117.192.0.0/10',
4484 'IO': '203.83.48.0/21',
4485 'IQ': '37.236.0.0/14',
4486 'IR': '2.176.0.0/12',
4487 'IS': '82.221.0.0/16',
4488 'IT': '79.0.0.0/10',
4489 'JE': '87.244.64.0/18',
4490 'JM': '72.27.0.0/17',
4491 'JO': '176.29.0.0/16',
4492 'JP': '133.0.0.0/8',
4493 'KE': '105.48.0.0/12',
4494 'KG': '158.181.128.0/17',
4495 'KH': '36.37.128.0/17',
4496 'KI': '103.25.140.0/22',
4497 'KM': '197.255.224.0/20',
4498 'KN': '198.167.192.0/19',
4499 'KP': '175.45.176.0/22',
4500 'KR': '175.192.0.0/10',
4501 'KW': '37.36.0.0/14',
4502 'KY': '64.96.0.0/15',
4503 'KZ': '2.72.0.0/13',
4504 'LA': '115.84.64.0/18',
4505 'LB': '178.135.0.0/16',
4506 'LC': '24.92.144.0/20',
4507 'LI': '82.117.0.0/19',
4508 'LK': '112.134.0.0/15',
4509 'LR': '102.183.0.0/16',
4510 'LS': '129.232.0.0/17',
4511 'LT': '78.56.0.0/13',
4512 'LU': '188.42.0.0/16',
4513 'LV': '46.109.0.0/16',
4514 'LY': '41.252.0.0/14',
4515 'MA': '105.128.0.0/11',
4516 'MC': '88.209.64.0/18',
4517 'MD': '37.246.0.0/16',
4518 'ME': '178.175.0.0/17',
4519 'MF': '74.112.232.0/21',
4520 'MG': '154.126.0.0/17',
4521 'MH': '117.103.88.0/21',
4522 'MK': '77.28.0.0/15',
4523 'ML': '154.118.128.0/18',
4524 'MM': '37.111.0.0/17',
4525 'MN': '49.0.128.0/17',
4526 'MO': '60.246.0.0/16',
4527 'MP': '202.88.64.0/20',
4528 'MQ': '109.203.224.0/19',
4529 'MR': '41.188.64.0/18',
4530 'MS': '208.90.112.0/22',
4531 'MT': '46.11.0.0/16',
4532 'MU': '105.16.0.0/12',
4533 'MV': '27.114.128.0/18',
4534 'MW': '102.70.0.0/15',
4535 'MX': '187.192.0.0/11',
4536 'MY': '175.136.0.0/13',
4537 'MZ': '197.218.0.0/15',
4538 'NA': '41.182.0.0/16',
4539 'NC': '101.101.0.0/18',
4540 'NE': '197.214.0.0/18',
4541 'NF': '203.17.240.0/22',
4542 'NG': '105.112.0.0/12',
4543 'NI': '186.76.0.0/15',
4544 'NL': '145.96.0.0/11',
4545 'NO': '84.208.0.0/13',
4546 'NP': '36.252.0.0/15',
4547 'NR': '203.98.224.0/19',
4548 'NU': '49.156.48.0/22',
4549 'NZ': '49.224.0.0/14',
4550 'OM': '5.36.0.0/15',
4551 'PA': '186.72.0.0/15',
4552 'PE': '186.160.0.0/14',
4553 'PF': '123.50.64.0/18',
4554 'PG': '124.240.192.0/19',
4555 'PH': '49.144.0.0/13',
4556 'PK': '39.32.0.0/11',
4557 'PL': '83.0.0.0/11',
4558 'PM': '70.36.0.0/20',
4559 'PR': '66.50.0.0/16',
4560 'PS': '188.161.0.0/16',
4561 'PT': '85.240.0.0/13',
4562 'PW': '202.124.224.0/20',
4563 'PY': '181.120.0.0/14',
4564 'QA': '37.210.0.0/15',
4565 'RE': '102.35.0.0/16',
4566 'RO': '79.112.0.0/13',
4567 'RS': '93.86.0.0/15',
4568 'RU': '5.136.0.0/13',
4569 'RW': '41.186.0.0/16',
4570 'SA': '188.48.0.0/13',
4571 'SB': '202.1.160.0/19',
4572 'SC': '154.192.0.0/11',
4573 'SD': '102.120.0.0/13',
4574 'SE': '78.64.0.0/12',
4575 'SG': '8.128.0.0/10',
4576 'SI': '188.196.0.0/14',
4577 'SK': '78.98.0.0/15',
4578 'SL': '102.143.0.0/17',
4579 'SM': '89.186.32.0/19',
4580 'SN': '41.82.0.0/15',
4581 'SO': '154.115.192.0/18',
4582 'SR': '186.179.128.0/17',
4583 'SS': '105.235.208.0/21',
4584 'ST': '197.159.160.0/19',
4585 'SV': '168.243.0.0/16',
4586 'SX': '190.102.0.0/20',
4587 'SY': '5.0.0.0/16',
4588 'SZ': '41.84.224.0/19',
4589 'TC': '65.255.48.0/20',
4590 'TD': '154.68.128.0/19',
4591 'TG': '196.168.0.0/14',
4592 'TH': '171.96.0.0/13',
4593 'TJ': '85.9.128.0/18',
4594 'TK': '27.96.24.0/21',
4595 'TL': '180.189.160.0/20',
4596 'TM': '95.85.96.0/19',
4597 'TN': '197.0.0.0/11',
4598 'TO': '175.176.144.0/21',
4599 'TR': '78.160.0.0/11',
4600 'TT': '186.44.0.0/15',
4601 'TV': '202.2.96.0/19',
4602 'TW': '120.96.0.0/11',
4603 'TZ': '156.156.0.0/14',
4604 'UA': '37.52.0.0/14',
4605 'UG': '102.80.0.0/13',
4606 'US': '6.0.0.0/8',
4607 'UY': '167.56.0.0/13',
4608 'UZ': '84.54.64.0/18',
4609 'VA': '212.77.0.0/19',
4610 'VC': '207.191.240.0/21',
4611 'VE': '186.88.0.0/13',
4612 'VG': '66.81.192.0/20',
4613 'VI': '146.226.0.0/16',
4614 'VN': '14.160.0.0/11',
4615 'VU': '202.80.32.0/20',
4616 'WF': '117.20.32.0/21',
4617 'WS': '202.4.32.0/19',
4618 'YE': '134.35.0.0/16',
4619 'YT': '41.242.116.0/22',
4620 'ZA': '41.0.0.0/11',
4621 'ZM': '102.144.0.0/13',
4622 'ZW': '102.177.192.0/18',
4623 }
4624
4625 @classmethod
4626 def random_ipv4(cls, code_or_block):
4627 if len(code_or_block) == 2:
4628 block = cls._country_ip_map.get(code_or_block.upper())
4629 if not block:
4630 return None
4631 else:
4632 block = code_or_block
4633 addr, preflen = block.split('/')
4634 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4635 addr_max = addr_min | (0xffffffff >> int(preflen))
4636 return compat_str(socket.inet_ntoa(
4637 struct.pack('!L', random.randint(addr_min, addr_max))))
4638
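# e.g. (sketch): GeoUtils.random_ipv4('US') returns a random address within
# 6.0.0.0/8, while GeoUtils.random_ipv4('192.0.2.0/24') stays inside that block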
4639
4640 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4641 def __init__(self, proxies=None):
4642 # Set default handlers
4643 for type in ('http', 'https'):
4644 setattr(self, '%s_open' % type,
4645 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4646 meth(r, proxy, type))
4647 urllib.request.ProxyHandler.__init__(self, proxies)
4648
4649 def proxy_open(self, req, proxy, type):
4650 req_proxy = req.headers.get('Ytdl-request-proxy')
4651 if req_proxy is not None:
4652 proxy = req_proxy
4653 del req.headers['Ytdl-request-proxy']
4654
4655 if proxy == '__noproxy__':
4656 return None # No Proxy
4657 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4658 req.add_header('Ytdl-socks-proxy', proxy)
4659 # yt-dlp's http/https handlers themselves handle wrapping the socket with SOCKS
4660 return None
4661 return urllib.request.ProxyHandler.proxy_open(
4662 self, req, proxy, type)
4663
4664
4665 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4666 # released into Public Domain
4667 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4668
4669 def long_to_bytes(n, blocksize=0):
4670 """long_to_bytes(n:long, blocksize:int) : string
4671 Convert a long integer to a byte string.
4672
4673 If optional blocksize is given and greater than zero, pad the front of the
4674 byte string with binary zeros so that the length is a multiple of
4675 blocksize.
4676 """
4677 # after much testing, this algorithm was deemed to be the fastest
4678 s = b''
4679 n = int(n)
4680 while n > 0:
4681 s = struct.pack('>I', n & 0xffffffff) + s
4682 n = n >> 32
4683 # strip off leading zeros
4684 for i in range(len(s)):
4685 if s[i] != b'\000'[0]:
4686 break
4687 else:
4688 # only happens when n == 0
4689 s = b'\000'
4690 i = 0
4691 s = s[i:]
4692 # add back some pad bytes. this could be done more efficiently w.r.t. the
4693 # de-padding being done above, but sigh...
4694 if blocksize > 0 and len(s) % blocksize:
4695 s = (blocksize - len(s) % blocksize) * b'\000' + s
4696 return s
4697
4698
4699 def bytes_to_long(s):
4700 """bytes_to_long(string) : long
4701 Convert a byte string to a long integer.
4702
4703 This is (essentially) the inverse of long_to_bytes().
4704 """
4705 acc = 0
4706 length = len(s)
4707 if length % 4:
4708 extra = (4 - length % 4)
4709 s = b'\000' * extra + s
4710 length = length + extra
4711 for i in range(0, length, 4):
4712 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4713 return acc
4714
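# e.g. (sketch): bytes_to_long(b'\x01\x00') == 256, long_to_bytes(256) == b'\x01\x00'
# and long_to_bytes(256, blocksize=4) == b'\x00\x00\x01\x00'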
4715
4716 def ohdave_rsa_encrypt(data, exponent, modulus):
4717 '''
4718 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4719
4720 Input:
4721 data: data to encrypt, bytes-like object
4722 exponent, modulus: parameter e and N of RSA algorithm, both integer
4723 Output: hex string of encrypted data
4724
4725 Limitation: supports one block encryption only
4726 '''
4727
4728 payload = int(binascii.hexlify(data[::-1]), 16)
4729 encrypted = pow(payload, exponent, modulus)
4730 return '%x' % encrypted
4731
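# A toy sketch with tiny, insecure parameters (real callers pass the e and N
# provided by the site): ohdave_rsa_encrypt(b'\x02', 3, 101) == '8', since
# pow(2, 3, 101) == 8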
4732
4733 def pkcs1pad(data, length):
4734 """
4735 Padding input data with PKCS#1 scheme
4736
4737 @param {int[]} data input data
4738 @param {int} length target length
4739 @returns {int[]} padded data
4740 """
4741 if len(data) > length - 11:
4742 raise ValueError('Input data too long for PKCS#1 padding')
4743
4744 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 padding octets must be non-zero
4745 return [0, 2] + pseudo_random + [0] + data
4746
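# e.g. (sketch): pkcs1pad([1, 2, 3], 16) yields [0, 2, r1, ..., r10, 0, 1, 2, 3]
# (16 values in total), where r1..r10 are random non-zero padding octets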
4747
4748 def _base_n_table(n, table):
4749 if not table and not n:
4750 raise ValueError('Either table or n must be specified')
4751 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4752
4753 if n is not None and n != len(table):  # n may be omitted when a table is given
4754 raise ValueError(f'base {n} exceeds table length {len(table)}')
4755 return table
4756
4757
4758 def encode_base_n(num, n=None, table=None):
4759 """Convert given int to a base-n string"""
4760 table = _base_n_table(n, table)
4761 if not num:
4762 return table[0]
4763
4764 result, base = '', len(table)
4765 while num:
4766 result = table[num % base] + result
4767 num = num // base
4768 return result
4769
4770
4771 def decode_base_n(string, n=None, table=None):
4772 """Convert given base-n string to int"""
4773 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4774 result, base = 0, len(table)
4775 for char in string:
4776 result = result * base + table[char]
4777 return result
4778
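# e.g. (sketch): encode_base_n(255, 16) == 'ff' and decode_base_n('ff', 16) == 255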
4779
4780 def decode_base(value, digits):
4781 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4782 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4783 return decode_base_n(value, table=digits)
4784
4785
4786 def decode_packed_codes(code):
4787 mobj = re.search(PACKED_CODES_RE, code)
4788 obfuscated_code, base, count, symbols = mobj.groups()
4789 base = int(base)
4790 count = int(count)
4791 symbols = symbols.split('|')
4792 symbol_table = {}
4793
4794 while count:
4795 count -= 1
4796 base_n_count = encode_base_n(count, base)
4797 symbol_table[base_n_count] = symbols[count] or base_n_count
4798
4799 return re.sub(
4800 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4801 obfuscated_code)
4802
4803
4804 def caesar(s, alphabet, shift):
4805 if shift == 0:
4806 return s
4807 l = len(alphabet)
4808 return ''.join(
4809 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4810 for c in s)
4811
4812
4813 def rot47(s):
4814 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4815
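# e.g. (sketch): caesar('abc', 'abcdefgh', 2) == 'cde'; rot47 shifts by half
# of its 94-character alphabet, so it is its own inverse:
# rot47(rot47('yt-dlp')) == 'yt-dlp'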
4816
4817 def parse_m3u8_attributes(attrib):
4818 info = {}
4819 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4820 if val.startswith('"'):
4821 val = val[1:-1]
4822 info[key] = val
4823 return info
4824
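# e.g. (sketch):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401e,mp4a.40.2"')
#       == {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}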
4825
4826 def urshift(val, n):
4827 return val >> n if val >= 0 else (val + 0x100000000) >> n
4828
4829
4830 # Based on png2str() written by @gdkchan and improved by @yokrysty
4831 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4832 def decode_png(png_data):
4833 # Reference: https://www.w3.org/TR/PNG/
4834 header = png_data[8:]
4835
4836 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4837 raise OSError('Not a valid PNG file.')
4838
4839 int_map = {1: '>B', 2: '>H', 4: '>I'}
4840 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4841
4842 chunks = []
4843
4844 while header:
4845 length = unpack_integer(header[:4])
4846 header = header[4:]
4847
4848 chunk_type = header[:4]
4849 header = header[4:]
4850
4851 chunk_data = header[:length]
4852 header = header[length:]
4853
4854 header = header[4:] # Skip CRC
4855
4856 chunks.append({
4857 'type': chunk_type,
4858 'length': length,
4859 'data': chunk_data
4860 })
4861
4862 ihdr = chunks[0]['data']
4863
4864 width = unpack_integer(ihdr[:4])
4865 height = unpack_integer(ihdr[4:8])
4866
4867 idat = b''
4868
4869 for chunk in chunks:
4870 if chunk['type'] == b'IDAT':
4871 idat += chunk['data']
4872
4873 if not idat:
4874 raise OSError('Unable to read PNG data.')
4875
4876 decompressed_data = bytearray(zlib.decompress(idat))
4877
4878 stride = width * 3
4879 pixels = []
4880
4881 def _get_pixel(idx):
4882 x = idx % stride
4883 y = idx // stride
4884 return pixels[y][x]
4885
4886 for y in range(height):
4887 basePos = y * (1 + stride)
4888 filter_type = decompressed_data[basePos]
4889
4890 current_row = []
4891
4892 pixels.append(current_row)
4893
4894 for x in range(stride):
4895 color = decompressed_data[1 + basePos + x]
4896 basex = y * stride + x
4897 left = 0
4898 up = 0
4899
4900 if x > 2:
4901 left = _get_pixel(basex - 3)
4902 if y > 0:
4903 up = _get_pixel(basex - stride)
4904
4905 if filter_type == 1: # Sub
4906 color = (color + left) & 0xff
4907 elif filter_type == 2: # Up
4908 color = (color + up) & 0xff
4909 elif filter_type == 3: # Average
4910 color = (color + ((left + up) >> 1)) & 0xff
4911 elif filter_type == 4: # Paeth
4912 a = left
4913 b = up
4914 c = 0
4915
4916 if x > 2 and y > 0:
4917 c = _get_pixel(basex - stride - 3)
4918
4919 p = a + b - c
4920
4921 pa = abs(p - a)
4922 pb = abs(p - b)
4923 pc = abs(p - c)
4924
4925 if pa <= pb and pa <= pc:
4926 color = (color + a) & 0xff
4927 elif pb <= pc:
4928 color = (color + b) & 0xff
4929 else:
4930 color = (color + c) & 0xff
4931
4932 current_row.append(color)
4933
4934 return width, height, pixels
4935
4936
4937 def write_xattr(path, key, value):
4938 # Windows: Write xattrs to NTFS Alternate Data Streams:
4939 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4940 if compat_os_name == 'nt':
4941 assert ':' not in key
4942 assert os.path.exists(path)
4943
4944 try:
4945 with open(f'{path}:{key}', 'wb') as f:
4946 f.write(value)
4947 except OSError as e:
4948 raise XAttrMetadataError(e.errno, e.strerror)
4949 return
4950
4951 # UNIX Method 1. Use xattrs/pyxattrs modules
4952
4953 setxattr = None
4954 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4955 # Unicode arguments are not supported in pyxattr until version 0.5.0
4956 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4957 if version_tuple(xattr.__version__) >= (0, 5, 0):
4958 setxattr = xattr.set
4959 elif xattr:
4960 setxattr = xattr.setxattr
4961
4962 if setxattr:
4963 try:
4964 setxattr(path, key, value)
4965 except OSError as e:
4966 raise XAttrMetadataError(e.errno, e.strerror)
4967 return
4968
4969 # UNIX Method 2. Use setfattr/xattr executables
4970 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4971 else 'xattr' if check_executable('xattr', ['-h']) else None)
4972 if not exe:
4973 raise XAttrUnavailableError(
4974 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4975 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4976
4977 value = value.decode()
4978 try:
4979 _, stderr, returncode = Popen.run(
4980 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4981 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4982 except OSError as e:
4983 raise XAttrMetadataError(e.errno, e.strerror)
4984 if returncode:
4985 raise XAttrMetadataError(returncode, stderr)
4986
4987
4988 def random_birthday(year_field, month_field, day_field):
4989 start_date = datetime.date(1950, 1, 1)
4990 end_date = datetime.date(1995, 12, 31)
4991 offset = random.randint(0, (end_date - start_date).days)
4992 random_date = start_date + datetime.timedelta(offset)
4993 return {
4994 year_field: str(random_date.year),
4995 month_field: str(random_date.month),
4996 day_field: str(random_date.day),
4997 }
4998
4999
5000 # Templates for internet shortcut files, which are plain text files.
5001 DOT_URL_LINK_TEMPLATE = '''\
5002 [InternetShortcut]
5003 URL=%(url)s
5004 '''
5005
5006 DOT_WEBLOC_LINK_TEMPLATE = '''\
5007 <?xml version="1.0" encoding="UTF-8"?>
5008 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5009 <plist version="1.0">
5010 <dict>
5011 \t<key>URL</key>
5012 \t<string>%(url)s</string>
5013 </dict>
5014 </plist>
5015 '''
5016
5017 DOT_DESKTOP_LINK_TEMPLATE = '''\
5018 [Desktop Entry]
5019 Encoding=UTF-8
5020 Name=%(filename)s
5021 Type=Link
5022 URL=%(url)s
5023 Icon=text-html
5024 '''
5025
5026 LINK_TEMPLATES = {
5027 'url': DOT_URL_LINK_TEMPLATE,
5028 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5029 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5030 }
5031
5032
5033 def iri_to_uri(iri):
5034 """
5035 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5036
5037 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes (using an underlying UTF-8 encoding) only those characters that are not escaped already, leaving the rest of the URI intact.
5038 """
5039
5040 iri_parts = compat_urllib_parse_urlparse(iri)
5041
5042 if '[' in iri_parts.netloc:
5043 raise ValueError('IPv6 URIs are not yet supported.')
5044 # Querying `.netloc` when there is only one bracket also raises a ValueError.
5045
5046 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5047
5048 net_location = ''
5049 if iri_parts.username:
5050 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5051 if iri_parts.password is not None:
5052 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5053 net_location += '@'
5054
5055 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5056 # The 'idna' encoding produces ASCII text.
5057 if iri_parts.port is not None and iri_parts.port != 80:
5058 net_location += ':' + str(iri_parts.port)
5059
5060 return urllib.parse.urlunparse(
5061 (iri_parts.scheme,
5062 net_location,
5063
5064 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5065
5066 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5067 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5068
5069 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5070 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5071
5072 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5073
5074 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5075
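# e.g. (sketch): iri_to_uri('https://www.google.com/search?q=Käsesoßenrührlöffel')
#      == 'https://www.google.com/search?q=K%C3%A4seso%C3%9Fenr%C3%BChrl%C3%B6ffel'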
5076
5077 def to_high_limit_path(path):
5078 if sys.platform in ['win32', 'cygwin']:
5079 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5080 return '\\\\?\\' + os.path.abspath(path)
5081
5082 return path
5083
5084
5085 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5086 val = traverse_obj(obj, *variadic(field))
5087 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5088 return default
5089 return template % func(val)
5090
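# e.g. (sketch): format_field({'height': 1080}, 'height', '%sp') == '1080p'
# and format_field({}, 'height', '%sp', default='?') == '?'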
5091
5092 def clean_podcast_url(url):
5093 return re.sub(r'''(?x)
5094 (?:
5095 (?:
5096 chtbl\.com/track|
5097 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5098 play\.podtrac\.com
5099 )/[^/]+|
5100 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5101 flex\.acast\.com|
5102 pd(?:
5103 cn\.co| # https://podcorn.com/analytics-prefix/
5104 st\.fm # https://podsights.com/docs/
5105 )/e
5106 )/''', '', url)
5107
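# e.g. (sketch, hypothetical URL):
#   clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.megaphone.fm/E1.mp3')
#       == 'https://traffic.megaphone.fm/E1.mp3'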
5108
5109 _HEX_TABLE = '0123456789abcdef'
5110
5111
5112 def random_uuidv4():
5113 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5114
5115
5116 def make_dir(path, to_screen=None):
5117 try:
5118 dn = os.path.dirname(path)
5119 if dn and not os.path.exists(dn):
5120 os.makedirs(dn)
5121 return True
5122 except OSError as err:
5123 if callable(to_screen):
5124 to_screen('unable to create directory ' + error_to_compat_str(err))
5125 return False
5126
5127
5128 def get_executable_path():
5129 from .update import _get_variant_and_executable_path
5130
5131 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5132
5133
5134 def load_plugins(name, suffix, namespace):
5135 classes = {}
5136 with contextlib.suppress(FileNotFoundError):
5137 plugins_spec = importlib.util.spec_from_file_location(
5138 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5139 plugins = importlib.util.module_from_spec(plugins_spec)
5140 sys.modules[plugins_spec.name] = plugins
5141 plugins_spec.loader.exec_module(plugins)
5142 for name in dir(plugins):
5143 if name in namespace:
5144 continue
5145 if not name.endswith(suffix):
5146 continue
5147 klass = getattr(plugins, name)
5148 classes[name] = namespace[name] = klass
5149 return classes
5150
5151
5152 def traverse_obj(
5153 obj, *path_list, default=None, expected_type=None, get_all=True,
5154 casesense=True, is_user_input=False, traverse_string=False):
5155 ''' Traverse nested list/dict/tuple
5156 @param path_list A list of paths which are checked one by one.
5157 Each path is a list of keys where each key is a:
5158 - None: Do nothing
5159 - string: A dictionary key
5160 - int: An index into a list
5161 - tuple: A list of keys all of which will be traversed
5162 - Ellipsis: Fetch all values in the object
5163 - Function: Takes the key and value as arguments
5164 and returns whether the key matches or not
5165 @param default Default value to return
5166 @param expected_type Only accept final value of this type (Can also be any callable)
5167 @param get_all Return all the values obtained from a path or only the first one
5168 @param casesense Whether to consider dictionary keys as case sensitive
5169 @param is_user_input Whether the keys are generated from user input. If True,
5170 strings are converted to int/slice if necessary
5171 @param traverse_string Whether to traverse inside strings. If True, any
5172 non-compatible object will also be converted into a string
5173 # TODO: Write tests
5174 '''
5175 if not casesense:
5176 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5177 path_list = (map(_lower, variadic(path)) for path in path_list)
5178
5179 def _traverse_obj(obj, path, _current_depth=0):
5180 nonlocal depth
5181 path = tuple(variadic(path))
5182 for i, key in enumerate(path):
5183 if None in (key, obj):
5184 return obj
5185 if isinstance(key, (list, tuple)):
5186 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5187 key = ...
5188 if key is ...:
5189 obj = (obj.values() if isinstance(obj, dict)
5190 else obj if isinstance(obj, (list, tuple, LazyList))
5191 else str(obj) if traverse_string else [])
5192 _current_depth += 1
5193 depth = max(depth, _current_depth)
5194 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5195 elif callable(key):
5196 if isinstance(obj, (list, tuple, LazyList)):
5197 obj = enumerate(obj)
5198 elif isinstance(obj, dict):
5199 obj = obj.items()
5200 else:
5201 if not traverse_string:
5202 return None
5203 obj = str(obj)
5204 _current_depth += 1
5205 depth = max(depth, _current_depth)
5206 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5207 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5208 obj = (obj.get(key) if casesense or (key in obj)
5209 else next((v for k, v in obj.items() if _lower(k) == key), None))
5210 else:
5211 if is_user_input:
5212 key = (int_or_none(key) if ':' not in key
5213 else slice(*map(int_or_none, key.split(':'))))
5214 if key == slice(None):
5215 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5216 if not isinstance(key, (int, slice)):
5217 return None
5218 if not isinstance(obj, (list, tuple, LazyList)):
5219 if not traverse_string:
5220 return None
5221 obj = str(obj)
5222 try:
5223 obj = obj[key]
5224 except IndexError:
5225 return None
5226 return obj
5227
5228 if isinstance(expected_type, type):
5229 type_test = lambda val: val if isinstance(val, expected_type) else None
5230 else:
5231 type_test = expected_type or IDENTITY
5232
5233 for path in path_list:
5234 depth = 0
5235 val = _traverse_obj(obj, path)
5236 if val is not None:
5237 if depth:
5238 for _ in range(depth - 1):
5239 val = itertools.chain.from_iterable(v for v in val if v is not None)
5240 val = [v for v in map(type_test, val) if v is not None]
5241 if val:
5242 return val if get_all else val[0]
5243 else:
5244 val = type_test(val)
5245 if val is not None:
5246 return val
5247 return default
5248
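# Usage sketches on hypothetical dicts:
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b')) == [1, 2]
#   traverse_obj({'a': None, 'c': 3}, 'a', 'c') == 3  # first path that yields a value wins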
5249
5250 def traverse_dict(dictn, keys, casesense=True):
5251 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5252 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5253 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5254
5255
5256 def get_first(obj, keys, **kwargs):
5257 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5258
5259
5260 def variadic(x, allowed_types=(str, bytes, dict)):
5261 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5262
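# e.g. (sketch): variadic('spam') == ('spam',) but variadic(['spam']) == ['spam'],
# since str/bytes/dict are treated as single values rather than iterated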
5263
5264 def time_seconds(**kwargs):
5265 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5266 return t.timestamp()  # note: the returned epoch value is independent of the timezone offset
5267
5268
5269 # create a JSON Web Signature (jws) with HS256 algorithm
5270 # the resulting format is in JWS Compact Serialization
5271 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5272 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5273 def jwt_encode_hs256(payload_data, key, headers=None):
5274 header_data = {
5275 'alg': 'HS256',
5276 'typ': 'JWT',
5277 }
5278 if headers:
5279 header_data.update(headers)
5280 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5281 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5282 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5283 signature_b64 = base64.b64encode(h.digest())
5284 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5285 return token
5286
5287
5288 # can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5289 def jwt_decode_hs256(jwt):
5290 header_b64, payload_b64, signature_b64 = jwt.split('.')
5291 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))  # re-add any stripped base64 padding; superfluous '='s are ignored
5292 return payload_data
5293
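# Roundtrip sketch with a hypothetical key:
#   token = jwt_encode_hs256({'id': 1}, 'secret')  # -> bytes
#   jwt_decode_hs256(token.decode()) == {'id': 1}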
5294
5295 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5296
5297
5298 @functools.cache
5299 def supports_terminal_sequences(stream):
5300 if compat_os_name == 'nt':
5301 if not WINDOWS_VT_MODE:
5302 return False
5303 elif not os.getenv('TERM'):
5304 return False
5305 try:
5306 return stream.isatty()
5307 except BaseException:
5308 return False
5309
5310
5311 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5312 if get_windows_version() < (10, 0, 10586):
5313 return
5314 global WINDOWS_VT_MODE
5315 try:
5316 Popen.run('', shell=True)
5317 except Exception:
5318 return
5319
5320 WINDOWS_VT_MODE = True
5321 supports_terminal_sequences.cache_clear()
5322
5323
5324 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5325
5326
5327 def remove_terminal_sequences(string):
5328 return _terminal_sequences_re.sub('', string)
5329
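# e.g. (sketch): remove_terminal_sequences('\033[32mOK\033[0m') == 'OK'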
5330
5331 def number_of_digits(number):
5332 return len('%d' % number)
5333
5334
5335 def join_nonempty(*values, delim='-', from_dict=None):
5336 if from_dict is not None:
5337 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5338 return delim.join(map(str, filter(None, values)))
5339
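# e.g. (sketch): join_nonempty('mp4', None, '', 1080, delim='-') == 'mp4-1080'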
5340
5341 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5342 """
5343 Find the largest format dimensions in terms of video width and, for each thumbnail:
5344 * Modify the URL: Match the width with the provided regex and replace with the former width
5345 * Update dimensions
5346
5347 This function is useful with video services that scale the provided thumbnails on demand
5348 """
5349 _keys = ('width', 'height')
5350 max_dimensions = max(
5351 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5352 default=(0, 0))
5353 if not max_dimensions[0]:
5354 return thumbnails
5355 return [
5356 merge_dicts(
5357 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5358 dict(zip(_keys, max_dimensions)), thumbnail)
5359 for thumbnail in thumbnails
5360 ]
5361
5362
5363 def parse_http_range(range):
5364 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5365 if not range:
5366 return None, None, None
5367 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5368 if not crg:
5369 return None, None, None
5370 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5371
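# e.g. (sketch): parse_http_range('bytes 0-499/1234') == (0, 499, 1234)
# and parse_http_range('bytes=500-') == (500, None, None)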
5372
5373 def read_stdin(what):
5374 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5375 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5376 return sys.stdin
5377
5378
5379 class Config:
5380 own_args = None
5381 parsed_args = None
5382 filename = None
5383 __initialized = False
5384
5385 def __init__(self, parser, label=None):
5386 self.parser, self.label = parser, label
5387 self._loaded_paths, self.configs = set(), []
5388
5389 def init(self, args=None, filename=None):
5390 assert not self.__initialized
5391 directory = ''
5392 if filename:
5393 location = os.path.realpath(filename)
5394 directory = os.path.dirname(location)
5395 if location in self._loaded_paths:
5396 return False
5397 self._loaded_paths.add(location)
5398
5399 self.own_args, self.__initialized = args, True
5400 opts, _ = self.parser.parse_known_args(args)
5401 self.parsed_args, self.filename = args, filename
5402
5403 for location in opts.config_locations or []:
5404 if location == '-':
5405 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5406 continue
5407 location = os.path.join(directory, expand_path(location))
5408 if os.path.isdir(location):
5409 location = os.path.join(location, 'yt-dlp.conf')
5410 if not os.path.exists(location):
5411 self.parser.error(f'config location {location} does not exist')
5412 self.append_config(self.read_file(location), location)
5413 return True
5414
5415 def __str__(self):
5416 label = join_nonempty(
5417 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5418 delim=' ')
5419 return join_nonempty(
5420 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5421 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5422 delim='\n')
5423
5424 @staticmethod
5425 def read_file(filename, default=[]):
5426 try:
5427 optionf = open(filename)
5428 except OSError:
5429 return default # silently skip if file is not present
5430 try:
5431 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5432 contents = optionf.read()
5433 res = shlex.split(contents, comments=True)
5434 except Exception as err:
5435 raise ValueError(f'Unable to parse "{filename}": {err}')
5436 finally:
5437 optionf.close()
5438 return res
5439
5440 @staticmethod
5441 def hide_login_info(opts):
5442 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5443 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5444
5445 def _scrub_eq(o):
5446 m = eqre.match(o)
5447 if m:
5448 return m.group('key') + '=PRIVATE'
5449 else:
5450 return o
5451
5452 opts = list(map(_scrub_eq, opts))
5453 for idx, opt in enumerate(opts):
5454 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5455 opts[idx + 1] = 'PRIVATE'
5456 return opts
5457
5458 def append_config(self, *args, label=None):
5459 config = type(self)(self.parser, label)
5460 config._loaded_paths = self._loaded_paths
5461 if config.init(*args):
5462 self.configs.append(config)
5463
5464 @property
5465 def all_args(self):
5466 for config in reversed(self.configs):
5467 yield from config.all_args
5468 yield from self.parsed_args or []
5469
5470 def parse_known_args(self, **kwargs):
5471 return self.parser.parse_known_args(self.all_args, **kwargs)
5472
5473 def parse_args(self):
5474 return self.parser.parse_args(self.all_args)
5475
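# e.g. (sketch): Config.hide_login_info(['-u', 'foo', '-p', 'bar', '--proxy', 'x'])
#      == ['-u', 'PRIVATE', '-p', 'PRIVATE', '--proxy', 'x']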
5476
5477 class WebSocketsWrapper:
5478 """Wraps websockets module to use in non-async scopes"""
5479 pool = None
5480
5481 def __init__(self, url, headers=None, connect=True):
5482 self.loop = asyncio.new_event_loop()
5483 # XXX: "loop" is deprecated
5484 self.conn = websockets.connect(
5485 url, extra_headers=headers, ping_interval=None,
5486 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5487 if connect:
5488 self.__enter__()
5489 atexit.register(self.__exit__, None, None, None)
5490
5491 def __enter__(self):
5492 if not self.pool:
5493 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5494 return self
5495
5496 def send(self, *args):
5497 self.run_with_loop(self.pool.send(*args), self.loop)
5498
5499 def recv(self, *args):
5500 return self.run_with_loop(self.pool.recv(*args), self.loop)
5501
5502 def __exit__(self, type, value, traceback):
5503 try:
5504 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5505 finally:
5506 self._cancel_all_tasks(self.loop)  # must happen before the loop is closed
5507 self.loop.close()
5508
5509 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5510 # for contributors: if any new library that uses asyncio needs to run in non-async code, move these functions out of this class
5511 @staticmethod
5512 def run_with_loop(main, loop):
5513 if not asyncio.iscoroutine(main):
5514 raise ValueError(f'a coroutine was expected, got {main!r}')
5515
5516 try:
5517 return loop.run_until_complete(main)
5518 finally:
5519 loop.run_until_complete(loop.shutdown_asyncgens())
5520 if hasattr(loop, 'shutdown_default_executor'):
5521 loop.run_until_complete(loop.shutdown_default_executor())
5522
5523 @staticmethod
5524 def _cancel_all_tasks(loop):
5525 to_cancel = asyncio.all_tasks(loop)
5526
5527 if not to_cancel:
5528 return
5529
5530 for task in to_cancel:
5531 task.cancel()
5532
5533 # XXX: "loop" is removed in python 3.10+
5534 loop.run_until_complete(
5535 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5536
5537 for task in to_cancel:
5538 if task.cancelled():
5539 continue
5540 if task.exception() is not None:
5541 loop.call_exception_handler({
5542 'message': 'unhandled exception during asyncio.run() shutdown',
5543 'exception': task.exception(),
5544 'task': task,
5545 })
5546
5547
5548 def merge_headers(*dicts):
5549 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5550 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5551
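# e.g. (sketch): merge_headers({'user-agent': 'A', 'X-One': '1'}, {'User-Agent': 'B'})
#      == {'User-Agent': 'B', 'X-One': '1'}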
5552
5553 class classproperty:
5554 """classmethod(property(func)) that works in py < 3.9"""
5555
5556 def __init__(self, func):
5557 functools.update_wrapper(self, func)
5558 self.func = func
5559
5560 def __get__(self, _, cls):
5561 return self.func(cls)
5562
5563
5564 class Namespace(types.SimpleNamespace):
5565 """Immutable namespace"""
5566
5567 def __iter__(self):
5568 return iter(self.__dict__.values())
5569
5570 @property
5571 def items_(self):
5572 return self.__dict__.items()
5573
5574
5575 # Deprecated
5576 has_certifi = bool(certifi)
5577 has_websockets = bool(websockets)