import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

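# Example (illustrative; the prefix and URL are hypothetical):
#   xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/{http://example.com/ns}author'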

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

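# Illustrative usage of the xpath_* helpers above:
#   doc = xml.etree.ElementTree.fromstring('<root><a x="1">hello</a></root>')
#   xpath_text(doc, './a')       == 'hello'
#   xpath_attr(doc, './a', 'x')  == '1'
#   xpath_element(doc, './b', default=None) is None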

def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]

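# Example (illustrative): the class may appear anywhere in the class attribute:
#   get_element_by_class('foo', '<div class="foo bar">text</div>')       == 'text'
#   get_element_html_by_class('foo', '<div class="foo bar">text</div>')  == '<div class="foo bar">text</div>'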

def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

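# Example (illustrative): nested tags of the same name are matched correctly:
#   get_element_text_and_html_by_tag('span', '<p><span>a<span>b</span></span></p>')
#   == ('a<span>b</span>', '<span>a<span>b</span></span>')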

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

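# Example (illustrative): <br> becomes a newline, tags and entities are removed:
#   clean_html('<p>Hello<br/>World</p>')  == 'Hello\nWorld'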

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        try:
            if self.ignore_extra:
                return self.raw_decode(s.lstrip())[0]
            return super().decode(s)
        except json.JSONDecodeError as e:
            if e.pos is not None:
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
            raise

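# Example (illustrative): tolerate trailing garbage after the JSON value:
#   json.loads('{"a": 1}</script>', cls=LenientJSONDecoder, ignore_extra=True)  == {'a': 1}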

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

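# Example (illustrative):
#   timeconvert('Wed, 14 Dec 2022 10:00:00 +0000')  == 1671012000
#   timeconvert('not a date') is None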

def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

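# Examples (illustrative):
#   sanitize_filename('AT&T', restricted=True)  == 'AT_T'
#   sanitize_filename('ä', restricted=True)     == 'a'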

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

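# Examples (illustrative):
#   sanitize_url('//example.com/video')         == 'http://example.com/video'
#   sanitize_url('httpss://example.com/video')  == 'https://example.com/video'
#   sanitize_url('rmtp://example.com/live')     == 'rtmp://example.com/live'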

def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

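# Example (illustrative): credentials are stripped from the URL and returned
# as a ready-to-use Authorization header value:
#   extract_basic_auth('http://user:pass@example.com/')
#   == ('http://example.com/', 'Basic dXNlcjpwYXNz')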

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

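# Example (illustrative): first occurrence wins, order is preserved:
#   orderedSet([1, 2, 1, 3, 2])  == [1, 2, 3]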

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

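# Example (illustrative): named, hexadecimal and decimal entities all decode:
#   unescapeHTML('&amp; &#x26; &#38;')  == '& & &'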

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode

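# Illustrative usage of Popen.run (the command shown is only an example):
#   stdout, stderr, retcode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)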

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

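# Examples (illustrative):
#   timetuple_from_msec(123456)       == Time(hours=0, minutes=2, seconds=3, milliseconds=456)
#   formatSeconds(3661.5, msec=True)  == '1:01:01.500'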

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

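# Example (illustrative): the marker header removes Accept-Encoding and itself:
#   handle_youtubedl_headers({'Youtubedl-no-compression': 'True', 'Accept-Encoding': 'gzip'})  == {}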

class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True

1678
1679 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1680 def __init__(self, cookiejar=None):
1681 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1682
1683 def http_response(self, request, response):
1684 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1685
1686 https_request = urllib.request.HTTPCookieProcessor.http_request
1687 https_response = http_response
1688
1689
1690 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1691 """YoutubeDL redirect handler
1692
1693 The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695 This redirect handler solves two issues:
1696 - ensures redirect URL is always unicode under python 2
1697 - introduces support for the HTTP response status code
1698 308 Permanent Redirect [2] (since standardized in RFC 7538) used by some sites [3]
1699
1700 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703 """
1704
1705 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1706
1707 def redirect_request(self, req, fp, code, msg, headers, newurl):
1708 """Return a Request or None in response to a redirect.
1709
1710 This is called by the http_error_30x methods when a
1711 redirection response is received. If a redirection should
1712 take place, return a new Request to allow http_error_30x to
1713 perform the redirect. Otherwise, raise HTTPError if no-one
1714 else should try to handle this url. Return None if you can't
1715 but another Handler might.
1716 """
1717 m = req.get_method()
1718 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719 or code in (301, 302, 303) and m == "POST")):
1720 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1721 # Strictly (according to RFC 2616), 301 or 302 in response to
1722 # a POST MUST NOT cause a redirection without confirmation
1723 # from the user (of urllib.request, in this case). In practice,
1724 # essentially all clients do redirect in this case, so we do
1725 # the same.
1726
1727 # Be conciliant with URIs containing a space. This is mainly
1728 # redundant with the more complete encoding done in http_error_302(),
1729 # but it is kept for compatibility with other callers.
1730 newurl = newurl.replace(' ', '%20')
1731
1732 CONTENT_HEADERS = ("content-length", "content-type")
1733 # Strip the entity-body headers so they are not carried over to the redirected request
1734 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1735
1736 # A 303 must either use GET or HEAD for subsequent request
1737 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738 if code == 303 and m != 'HEAD':
1739 m = 'GET'
1740 # 301 and 302 redirects are commonly turned into a GET from a POST
1741 # for subsequent requests by browsers, so we'll do the same.
1742 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744 if code in (301, 302) and m == 'POST':
1745 m = 'GET'
1746
1747 return urllib.request.Request(
1748 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1749 unverifiable=True, method=m)
1750
1751
1752 def extract_timezone(date_str):
1753 m = re.search(
1754 r'''(?x)
1755 ^.{8,}? # >=8 char non-TZ prefix, if present
1756 (?P<tz>Z| # just the UTC Z, or
1757 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1758 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759 [ ]? # optional space
1760 (?P<sign>\+|-) # +/-
1761 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1762 $)
1763 ''', date_str)
1764 if not m:
1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767 if timezone is not None:
1768 date_str = date_str[:-len(m.group('tz'))]
1769 timezone = datetime.timedelta(hours=timezone or 0)
1770 else:
1771 date_str = date_str[:-len(m.group('tz'))]
1772 if not m.group('sign'):
1773 timezone = datetime.timedelta()
1774 else:
1775 sign = 1 if m.group('sign') == '+' else -1
1776 timezone = datetime.timedelta(
1777 hours=sign * int(m.group('hours')),
1778 minutes=sign * int(m.group('minutes')))
1779 return timezone, date_str
1780
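# Illustrative behaviour of the extraction above (an informal sketch derived
# from the regexes, not executed at import time):
#   extract_timezone('2023-01-01T12:00:00+05:30')
#       == (datetime.timedelta(hours=5, minutes=30), '2023-01-01T12:00:00')
#   extract_timezone('2023-01-01T12:00:00Z')
#       == (datetime.timedelta(0), '2023-01-01T12:00:00')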
1781
1782 def parse_iso8601(date_str, delimiter='T', timezone=None):
1783 """ Return a UNIX timestamp from the given date """
1784
1785 if date_str is None:
1786 return None
1787
1788 date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
1790 if timezone is None:
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 with contextlib.suppress(ValueError):
1794 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1795 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796 return calendar.timegm(dt.timetuple())
1797
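# e.g. (informal sketch, mirroring the test suite):
#   parse_iso8601('2014-03-23T23:04:26+0100') == 1395612266
#   parse_iso8601(None) is None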
1798
1799 def date_formats(day_first=True):
1800 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
1803 def unified_strdate(date_str, day_first=True):
1804 """Return a string with the date in the format YYYYMMDD"""
1805
1806 if date_str is None:
1807 return None
1808 upload_date = None
1809 # Replace commas
1810 date_str = date_str.replace(',', ' ')
1811 # Remove AM/PM + timezone
1812 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1813 _, date_str = extract_timezone(date_str)
1814
1815 for expression in date_formats(day_first):
1816 with contextlib.suppress(ValueError):
1817 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1818 if upload_date is None:
1819 timetuple = email.utils.parsedate_tz(date_str)
1820 if timetuple:
1821 with contextlib.suppress(ValueError):
1822 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1823 if upload_date is not None:
1824 return str(upload_date)
1825
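# Illustrative usage (informal sketch; assumes DATE_FORMATS covers these shapes):
#   unified_strdate('December 21, 2010') == '20101221'
#   unified_strdate('1968-12-10') == '19681210'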
1826
1827 def unified_timestamp(date_str, day_first=True):
1828 if date_str is None:
1829 return None
1830
1831 date_str = re.sub(r'\s+', ' ', re.sub(
1832 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1833
1834 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1835 timezone, date_str = extract_timezone(date_str)
1836
1837 # Remove AM/PM + timezone
1838 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
1840 # Remove unrecognized timezones from ISO 8601 alike timestamps
1841 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842 if m:
1843 date_str = date_str[:-len(m.group('tz'))]
1844
1845 # Python only supports microseconds, so remove nanoseconds
1846 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847 if m:
1848 date_str = m.group(1)
1849
1850 for expression in date_formats(day_first):
1851 with contextlib.suppress(ValueError):
1852 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1853 return calendar.timegm(dt.timetuple())
1854
1855 timetuple = email.utils.parsedate_tz(date_str)
1856 if timetuple:
1857 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1858
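# Illustrative usage (informal sketch): the timezone offset is folded into
# the returned UNIX timestamp, like parse_iso8601 above:
#   unified_timestamp('2014-03-23T23:04:26+0100') == 1395612266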
1859
1860 def determine_ext(url, default_ext='unknown_video'):
1861 if url is None or '.' not in url:
1862 return default_ext
1863 guess = url.partition('?')[0].rpartition('.')[2]
1864 if re.match(r'^[A-Za-z0-9]+$', guess):
1865 return guess
1866 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1867 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1868 return guess.rstrip('/')
1869 else:
1870 return default_ext
1871
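# Illustrative usage (informal sketch):
#   determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'
#   determine_ext('http://example.com/foo/bar.nonext/?download') == 'unknown_video'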
1872
1873 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1875
1876
1877 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1878 R"""
1879 Return a datetime object from a string.
1880 Supported format:
1881 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883 @param format strftime format of DATE
1884 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1885 auto: round to the unit provided in date_str (if applicable).
1886 """
1887 auto_precision = False
1888 if precision == 'auto':
1889 auto_precision = True
1890 precision = 'microsecond'
1891 today = datetime_round(datetime.datetime.utcnow(), precision)
1892 if date_str in ('now', 'today'):
1893 return today
1894 if date_str == 'yesterday':
1895 return today - datetime.timedelta(days=1)
1896 match = re.match(
1897 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1898 date_str)
1899 if match is not None:
1900 start_time = datetime_from_str(match.group('start'), precision, format)
1901 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1902 unit = match.group('unit')
1903 if unit == 'month' or unit == 'year':
1904 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1905 unit = 'day'
1906 else:
1907 if unit == 'week':
1908 unit = 'day'
1909 time *= 7
1910 delta = datetime.timedelta(**{unit + 's': time})
1911 new_date = start_time + delta
1912 if auto_precision:
1913 return datetime_round(new_date, unit)
1914 return new_date
1915
1916 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1917
1918
1919 def date_from_str(date_str, format='%Y%m%d', strict=False):
1920 R"""
1921 Return a date object from a string using datetime_from_str
1922
1923 @param strict Restrict allowed patterns to "YYYYMMDD" and
1924 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1925 """
1926 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927 raise ValueError(f'Invalid date format "{date_str}"')
1928 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1929
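# Illustrative usage (informal sketch; the result depends on the current date):
#   date_from_str('now-1week')  # -> the date one week before today (UTC)
#   date_from_str('now+1day', strict=True)  # raises ValueError ('+' is not allowed in strict mode)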
1930
1931 def datetime_add_months(dt, months):
1932 """Increment/Decrement a datetime object by months."""
1933 month = dt.month + months - 1
1934 year = dt.year + month // 12
1935 month = month % 12 + 1
1936 day = min(dt.day, calendar.monthrange(year, month)[1])
1937 return dt.replace(year, month, day)
1938
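# e.g. the day is clamped to the length of the target month (informal sketch):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1) == datetime.datetime(2020, 2, 29)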
1939
1940 def datetime_round(dt, precision='day'):
1941 """
1942 Round a datetime object's time to a specific precision
1943 """
1944 if precision == 'microsecond':
1945 return dt
1946
1947 unit_seconds = {
1948 'day': 86400,
1949 'hour': 3600,
1950 'minute': 60,
1951 'second': 1,
1952 }
1953 roundto = lambda x, n: ((x + n / 2) // n) * n
1954 timestamp = calendar.timegm(dt.timetuple())
1955 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1956
1957
1958 def hyphenate_date(date_str):
1959 """
1960 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962 if match is not None:
1963 return '-'.join(match.groups())
1964 else:
1965 return date_str
1966
1967
1968 class DateRange:
1969 """Represents a time interval between two dates"""
1970
1971 def __init__(self, start=None, end=None):
1972 """start and end must be strings in the format accepted by date"""
1973 if start is not None:
1974 self.start = date_from_str(start, strict=True)
1975 else:
1976 self.start = datetime.datetime.min.date()
1977 if end is not None:
1978 self.end = date_from_str(end, strict=True)
1979 else:
1980 self.end = datetime.datetime.max.date()
1981 if self.start > self.end:
1982 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1983
1984 @classmethod
1985 def day(cls, day):
1986 """Returns a range that only contains the given day"""
1987 return cls(day, day)
1988
1989 def __contains__(self, date):
1990 """Check if the date is in the range"""
1991 if not isinstance(date, datetime.date):
1992 date = date_from_str(date)
1993 return self.start <= date <= self.end
1994
1995 def __str__(self):
1996 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1997
1998 def __eq__(self, other):
1999 return (isinstance(other, DateRange)
2000 and self.start == other.start and self.end == other.end)
2001
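# Illustrative usage (informal sketch; strings are parsed with date_from_str):
#   '20221215' in DateRange('20221201', '20221231')  # -> True
#   '20230101' in DateRange.day('20221215')          # -> False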
2002
2003 def platform_name():
2004 """ Returns the platform name as a str """
2005 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2006 return platform.platform()
2007
2008
2009 @functools.cache
2010 def system_identifier():
2011 python_implementation = platform.python_implementation()
2012 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2014 libc_ver = []
2015 with contextlib.suppress(OSError): # We may not have access to the executable
2016 libc_ver = platform.libc_ver()
2017
2018 return 'Python %s (%s %s %s) - %s (%s%s)' % (
2019 platform.python_version(),
2020 python_implementation,
2021 platform.machine(),
2022 platform.architecture()[0],
2023 platform.platform(),
2024 ssl.OPENSSL_VERSION,
2025 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2026 )
2027
2028
2029 @functools.cache
2030 def get_windows_version():
2031 ''' Get Windows version. Returns () if not running on Windows '''
2032 if compat_os_name == 'nt':
2033 return version_tuple(platform.win32_ver()[1])
2034 else:
2035 return ()
2036
2037
2038 def write_string(s, out=None, encoding=None):
2039 assert isinstance(s, str)
2040 out = out or sys.stderr
2041
2042 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2043 s = re.sub(r'([\r\n]+)', r' \1', s)
2044
2045 enc, buffer = None, out
2046 if 'b' in getattr(out, 'mode', ''):
2047 enc = encoding or preferredencoding()
2048 elif hasattr(out, 'buffer'):
2049 buffer = out.buffer
2050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2051
2052 buffer.write(s.encode(enc, 'ignore') if enc else s)
2053 out.flush()
2054
2055
2056 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057 from . import _IN_CLI
2058 if _IN_CLI:
2059 if msg in deprecation_warning._cache:
2060 return
2061 deprecation_warning._cache.add(msg)
2062 if printer:
2063 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065 else:
2066 import warnings
2067 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070 deprecation_warning._cache = set()
2071
2072
2073 def bytes_to_intlist(bs):
2074 if not bs:
2075 return []
2076 if isinstance(bs[0], int): # Python 3
2077 return list(bs)
2078 else:
2079 return [ord(c) for c in bs]
2080
2081
2082 def intlist_to_bytes(xs):
2083 if not xs:
2084 return b''
2085 return struct.pack('%dB' % len(xs), *xs)
2086
2087
2088 class LockingUnsupportedError(OSError):
2089 msg = 'File locking is not supported'
2090
2091 def __init__(self):
2092 super().__init__(self.msg)
2093
2094
2095 # Cross-platform file locking
2096 if sys.platform == 'win32':
2097 import ctypes
2098 import ctypes.wintypes
2099 import msvcrt
2100
2101 class OVERLAPPED(ctypes.Structure):
2102 _fields_ = [
2103 ('Internal', ctypes.wintypes.LPVOID),
2104 ('InternalHigh', ctypes.wintypes.LPVOID),
2105 ('Offset', ctypes.wintypes.DWORD),
2106 ('OffsetHigh', ctypes.wintypes.DWORD),
2107 ('hEvent', ctypes.wintypes.HANDLE),
2108 ]
2109
2110 kernel32 = ctypes.windll.kernel32
2111 LockFileEx = kernel32.LockFileEx
2112 LockFileEx.argtypes = [
2113 ctypes.wintypes.HANDLE, # hFile
2114 ctypes.wintypes.DWORD, # dwFlags
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 LockFileEx.restype = ctypes.wintypes.BOOL
2121 UnlockFileEx = kernel32.UnlockFileEx
2122 UnlockFileEx.argtypes = [
2123 ctypes.wintypes.HANDLE, # hFile
2124 ctypes.wintypes.DWORD, # dwReserved
2125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2126 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2127 ctypes.POINTER(OVERLAPPED) # Overlapped
2128 ]
2129 UnlockFileEx.restype = ctypes.wintypes.BOOL
2130 whole_low = 0xffffffff
2131 whole_high = 0x7fffffff
2132
2133 def _lock_file(f, exclusive, block):
2134 overlapped = OVERLAPPED()
2135 overlapped.Offset = 0
2136 overlapped.OffsetHigh = 0
2137 overlapped.hEvent = 0
2138 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2139
2140 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142 0, whole_low, whole_high, f._lock_file_overlapped_p):
2143 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2144 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2145
2146 def _unlock_file(f):
2147 assert f._lock_file_overlapped_p
2148 handle = msvcrt.get_osfhandle(f.fileno())
2149 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2150 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152 else:
2153 try:
2154 import fcntl
2155
2156 def _lock_file(f, exclusive, block):
2157 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158 if not block:
2159 flags |= fcntl.LOCK_NB
2160 try:
2161 fcntl.flock(f, flags)
2162 except BlockingIOError:
2163 raise
2164 except OSError: # AOSP does not have flock()
2165 fcntl.lockf(f, flags)
2166
2167 def _unlock_file(f):
2168 try:
2169 fcntl.flock(f, fcntl.LOCK_UN)
2170 except OSError:
2171 fcntl.lockf(f, fcntl.LOCK_UN)
2172
2173 except ImportError:
2174
2175 def _lock_file(f, exclusive, block):
2176 raise LockingUnsupportedError()
2177
2178 def _unlock_file(f):
2179 raise LockingUnsupportedError()
2180
2181
2182 class locked_file:
2183 locked = False
2184
2185 def __init__(self, filename, mode, block=True, encoding=None):
2186 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187 raise NotImplementedError(mode)
2188 self.mode, self.block = mode, block
2189
2190 writable = any(f in mode for f in 'wax+')
2191 readable = any(f in mode for f in 'r+')
2192 flags = functools.reduce(operator.ior, (
2193 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2194 getattr(os, 'O_BINARY', 0), # Windows only
2195 getattr(os, 'O_NOINHERIT', 0), # Windows only
2196 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2197 os.O_APPEND if 'a' in mode else 0,
2198 os.O_EXCL if 'x' in mode else 0,
2199 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200 ))
2201
2202 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2203
2204 def __enter__(self):
2205 exclusive = 'r' not in self.mode
2206 try:
2207 _lock_file(self.f, exclusive, self.block)
2208 self.locked = True
2209 except OSError:
2210 self.f.close()
2211 raise
2212 if 'w' in self.mode:
2213 try:
2214 self.f.truncate()
2215 except OSError as e:
2216 if e.errno not in (
2217 errno.ESPIPE, # Illegal seek - expected for FIFO
2218 errno.EINVAL, # Invalid argument - expected for /dev/null
2219 ):
2220 raise
2221 return self
2222
2223 def unlock(self):
2224 if not self.locked:
2225 return
2226 try:
2227 _unlock_file(self.f)
2228 finally:
2229 self.locked = False
2230
2231 def __exit__(self, *_):
2232 try:
2233 self.unlock()
2234 finally:
2235 self.f.close()
2236
2237 open = __enter__
2238 close = __exit__
2239
2240 def __getattr__(self, attr):
2241 return getattr(self.f, attr)
2242
2243 def __iter__(self):
2244 return iter(self.f)
2245
2246
2247 @functools.cache
2248 def get_filesystem_encoding():
2249 encoding = sys.getfilesystemencoding()
2250 return encoding if encoding is not None else 'utf-8'
2251
2252
2253 def shell_quote(args):
2254 quoted_args = []
2255 encoding = get_filesystem_encoding()
2256 for a in args:
2257 if isinstance(a, bytes):
2258 # We may get a filename encoded with 'encodeFilename'
2259 a = a.decode(encoding)
2260 quoted_args.append(compat_shlex_quote(a))
2261 return ' '.join(quoted_args)
2262
2263
2264 def smuggle_url(url, data):
2265 """ Pass additional data in a URL for internal use. """
2266
2267 url, idata = unsmuggle_url(url, {})
2268 data.update(idata)
2269 sdata = urllib.parse.urlencode(
2270 {'__youtubedl_smuggle': json.dumps(data)})
2271 return url + '#' + sdata
2272
2273
2274 def unsmuggle_url(smug_url, default=None):
2275 if '#__youtubedl_smuggle' not in smug_url:
2276 return smug_url, default
2277 url, _, sdata = smug_url.rpartition('#')
2278 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2279 data = json.loads(jsond)
2280 return url, data
2281
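# Round-trip sketch (informal; the payload travels in the URL fragment):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url(url) == ('https://example.com/video', {'referer': 'https://example.com'})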
2282
2283 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284 """ Formats numbers with decimal sufixes like K, M, etc """
2285 num, factor = float_or_none(num), float(factor)
2286 if num is None or num < 0:
2287 return None
2288 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2291 if factor == 1024:
2292 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2293 converted = num / (factor ** exponent)
2294 return fmt % (converted, suffix)
2295
2296
2297 def format_bytes(bytes):
2298 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2299
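# Illustrative usage (informal sketch):
#   format_decimal_suffix(1234000, '%.1f%s')  # -> '1.2M'
#   format_bytes(1048576)  # -> '1.00MiB'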
2300
2301 def lookup_unit_table(unit_table, s, strict=False):
2302 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2303 units_re = '|'.join(re.escape(u) for u in unit_table)
2304 m = (re.fullmatch if strict else re.match)(
2305 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2306 if not m:
2307 return None
2308
2309 num = float(m.group('num').replace(',', '.'))
2310 mult = unit_table[m.group('unit')]
2311 return round(num * mult)
2312
2313
2314 def parse_bytes(s):
2315 """Parse a string indicating a byte quantity into an integer"""
2316 return lookup_unit_table(
2317 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318 s.upper(), strict=True)
2319
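# Illustrative usage (informal sketch; units are single letters, binary multiples):
#   parse_bytes('1M')    # -> 1048576
#   parse_bytes('500K')  # -> 512000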
2320
2321 def parse_filesize(s):
2322 if s is None:
2323 return None
2324
2325 # The lower-case forms are of course incorrect and unofficial,
2326 # but we support those too
2327 _UNIT_TABLE = {
2328 'B': 1,
2329 'b': 1,
2330 'bytes': 1,
2331 'KiB': 1024,
2332 'KB': 1000,
2333 'kB': 1024,
2334 'Kb': 1000,
2335 'kb': 1000,
2336 'kilobytes': 1000,
2337 'kibibytes': 1024,
2338 'MiB': 1024 ** 2,
2339 'MB': 1000 ** 2,
2340 'mB': 1024 ** 2,
2341 'Mb': 1000 ** 2,
2342 'mb': 1000 ** 2,
2343 'megabytes': 1000 ** 2,
2344 'mebibytes': 1024 ** 2,
2345 'GiB': 1024 ** 3,
2346 'GB': 1000 ** 3,
2347 'gB': 1024 ** 3,
2348 'Gb': 1000 ** 3,
2349 'gb': 1000 ** 3,
2350 'gigabytes': 1000 ** 3,
2351 'gibibytes': 1024 ** 3,
2352 'TiB': 1024 ** 4,
2353 'TB': 1000 ** 4,
2354 'tB': 1024 ** 4,
2355 'Tb': 1000 ** 4,
2356 'tb': 1000 ** 4,
2357 'terabytes': 1000 ** 4,
2358 'tebibytes': 1024 ** 4,
2359 'PiB': 1024 ** 5,
2360 'PB': 1000 ** 5,
2361 'pB': 1024 ** 5,
2362 'Pb': 1000 ** 5,
2363 'pb': 1000 ** 5,
2364 'petabytes': 1000 ** 5,
2365 'pebibytes': 1024 ** 5,
2366 'EiB': 1024 ** 6,
2367 'EB': 1000 ** 6,
2368 'eB': 1024 ** 6,
2369 'Eb': 1000 ** 6,
2370 'eb': 1000 ** 6,
2371 'exabytes': 1000 ** 6,
2372 'exbibytes': 1024 ** 6,
2373 'ZiB': 1024 ** 7,
2374 'ZB': 1000 ** 7,
2375 'zB': 1024 ** 7,
2376 'Zb': 1000 ** 7,
2377 'zb': 1000 ** 7,
2378 'zettabytes': 1000 ** 7,
2379 'zebibytes': 1024 ** 7,
2380 'YiB': 1024 ** 8,
2381 'YB': 1000 ** 8,
2382 'yB': 1024 ** 8,
2383 'Yb': 1000 ** 8,
2384 'yb': 1000 ** 8,
2385 'yottabytes': 1000 ** 8,
2386 'yobibytes': 1024 ** 8,
2387 }
2388
2389 return lookup_unit_table(_UNIT_TABLE, s)
2390
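# Illustrative usage (informal sketch; note the decimal vs binary unit split):
#   parse_filesize('1,24 KB')  # -> 1240 (KB is decimal)
#   parse_filesize('1.5GiB')   # -> 1610612736 (GiB is binary)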
2391
2392 def parse_count(s):
2393 if s is None:
2394 return None
2395
2396 s = re.sub(r'^[^\d]+\s', '', s).strip()
2397
2398 if re.match(r'^[\d,.]+$', s):
2399 return str_to_int(s)
2400
2401 _UNIT_TABLE = {
2402 'k': 1000,
2403 'K': 1000,
2404 'm': 1000 ** 2,
2405 'M': 1000 ** 2,
2406 'kk': 1000 ** 2,
2407 'KK': 1000 ** 2,
2408 'b': 1000 ** 3,
2409 'B': 1000 ** 3,
2410 }
2411
2412 ret = lookup_unit_table(_UNIT_TABLE, s)
2413 if ret is not None:
2414 return ret
2415
2416 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417 if mobj:
2418 return str_to_int(mobj.group(1))
2419
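# Illustrative usage (informal sketch):
#   parse_count('1,100')      # -> 1100
#   parse_count('18M views')  # -> 18000000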
2420
2421 def parse_resolution(s, *, lenient=False):
2422 if s is None:
2423 return {}
2424
2425 if lenient:
2426 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427 else:
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2429 if mobj:
2430 return {
2431 'width': int(mobj.group('w')),
2432 'height': int(mobj.group('h')),
2433 }
2434
2435 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2436 if mobj:
2437 return {'height': int(mobj.group(1))}
2438
2439 mobj = re.search(r'\b([48])[kK]\b', s)
2440 if mobj:
2441 return {'height': int(mobj.group(1)) * 540}
2442
2443 return {}
2444
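# Illustrative usage (informal sketch):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}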
2445
2446 def parse_bitrate(s):
2447 if not isinstance(s, str):
2448 return
2449 mobj = re.search(r'\b(\d+)\s*kbps', s)
2450 if mobj:
2451 return int(mobj.group(1))
2452
2453
2454 def month_by_name(name, lang='en'):
2455 """ Return the number of a month by (locale-independently) English name """
2456
2457 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2458
2459 try:
2460 return month_names.index(name) + 1
2461 except ValueError:
2462 return None
2463
2464
2465 def month_by_abbreviation(abbrev):
2466 """ Return the number of a month by (locale-independently) English
2467 abbreviations """
2468
2469 try:
2470 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2471 except ValueError:
2472 return None
2473
2474
2475 def fix_xml_ampersands(xml_str):
2476 """Replace all the '&' by '&amp;' in XML"""
2477 return re.sub(
2478 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2479 '&amp;',
2480 xml_str)
2481
2482
2483 def setproctitle(title):
2484 assert isinstance(title, str)
2485
2486 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487 try:
2488 import ctypes
2489 except ImportError:
2490 return
2491
2492 try:
2493 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2494 except OSError:
2495 return
2496 except TypeError:
2497 # LoadLibrary in Windows Python 2.7.13 only expects
2498 # a bytestring, but since unicode_literals turns
2499 # every string into a unicode string, it fails.
2500 return
2501 title_bytes = title.encode()
2502 buf = ctypes.create_string_buffer(len(title_bytes))
2503 buf.value = title_bytes
2504 try:
2505 libc.prctl(15, buf, 0, 0, 0)
2506 except AttributeError:
2507 return # Strange libc, just skip this
2508
2509
2510 def remove_start(s, start):
2511 return s[len(start):] if s is not None and s.startswith(start) else s
2512
2513
2514 def remove_end(s, end):
2515 return s[:-len(end)] if s is not None and s.endswith(end) else s
2516
2517
2518 def remove_quotes(s):
2519 if s is None or len(s) < 2:
2520 return s
2521 for quote in ('"', "'", ):
2522 if s[0] == quote and s[-1] == quote:
2523 return s[1:-1]
2524 return s
2525
2526
2527 def get_domain(url):
2528 """
2529 This implementation is inconsistent, but is kept for compatibility.
2530 Use this only for "webpage_url_domain"
2531 """
2532 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2533
2534
2535 def url_basename(url):
2536 path = urllib.parse.urlparse(url).path
2537 return path.strip('/').split('/')[-1]
2538
2539
2540 def base_url(url):
2541 return re.match(r'https?://[^?#]+/', url).group()
2542
2543
2544 def urljoin(base, path):
2545 if isinstance(path, bytes):
2546 path = path.decode()
2547 if not isinstance(path, str) or not path:
2548 return None
2549 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):  # '-' last so it is literal, not a range
2550 return path
2551 if isinstance(base, bytes):
2552 base = base.decode()
2553 if not isinstance(base, str) or not re.match(
2554 r'^(?:https?:)?//', base):
2555 return None
2556 return urllib.parse.urljoin(base, path)
2557
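# Illustrative usage (informal sketch):
#   urljoin('https://example.com/a/', 'b/c.txt')  # -> 'https://example.com/a/b/c.txt'
#   urljoin('https://example.com/', '//cdn.example.com/x')  # -> '//cdn.example.com/x' (already absolute)
#   urljoin('ftp://example.com/', 'x')  # -> None (base must be http(s))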
2558
2559 class HEADRequest(urllib.request.Request):
2560 def get_method(self):
2561 return 'HEAD'
2562
2563
2564 class PUTRequest(urllib.request.Request):
2565 def get_method(self):
2566 return 'PUT'
2567
2568
2569 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2570 if get_attr and v is not None:
2571 v = getattr(v, get_attr, None)
2572 try:
2573 return int(v) * invscale // scale
2574 except (ValueError, TypeError, OverflowError):
2575 return default
2576
2577
2578 def str_or_none(v, default=None):
2579 return default if v is None else str(v)
2580
2581
2582 def str_to_int(int_str):
2583 """ A more relaxed version of int_or_none """
2584 if isinstance(int_str, int):
2585 return int_str
2586 elif isinstance(int_str, str):
2587 int_str = re.sub(r'[,\.\+]', '', int_str)
2588 return int_or_none(int_str)
2589
2590
2591 def float_or_none(v, scale=1, invscale=1, default=None):
2592 if v is None:
2593 return default
2594 try:
2595 return float(v) * invscale / scale
2596 except (ValueError, TypeError):
2597 return default
2598
2599
2600 def bool_or_none(v, default=None):
2601 return v if isinstance(v, bool) else default
2602
2603
2604 def strip_or_none(v, default=None):
2605 return v.strip() if isinstance(v, str) else default
2606
2607
2608 def url_or_none(url):
2609 if not url or not isinstance(url, str):
2610 return None
2611 url = url.strip()
2612 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2613
2614
2615 def request_to_url(req):
2616 if isinstance(req, urllib.request.Request):
2617 return req.get_full_url()
2618 else:
2619 return req
2620
2621
2622 def strftime_or_none(timestamp, date_format, default=None):
2623 datetime_object = None
2624 try:
2625 if isinstance(timestamp, (int, float)): # unix timestamp
2626 # Using naive datetime here can break timestamp() in Windows
2627 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2629 elif isinstance(timestamp, str): # assume YYYYMMDD
2630 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2631 date_format = re.sub( # Support %s on windows
2632 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2633 return datetime_object.strftime(date_format)
2634 except (ValueError, TypeError, AttributeError):
2635 return default
2636
2637
2638 def parse_duration(s):
2639 if not isinstance(s, str):
2640 return None
2641 s = s.strip()
2642 if not s:
2643 return None
2644
2645 days, hours, mins, secs, ms = [None] * 5
2646 m = re.match(r'''(?x)
2647 (?P<before_secs>
2648 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650 (?P<ms>[.:][0-9]+)?Z?$
2651 ''', s)
2652 if m:
2653 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2654 else:
2655 m = re.match(
2656 r'''(?ix)(?:P?
2657 (?:
2658 [0-9]+\s*y(?:ears?)?,?\s*
2659 )?
2660 (?:
2661 [0-9]+\s*m(?:onths?)?,?\s*
2662 )?
2663 (?:
2664 [0-9]+\s*w(?:eeks?)?,?\s*
2665 )?
2666 (?:
2667 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2668 )?
2669 T)?
2670 (?:
2671 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2672 )?
2673 (?:
2674 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2675 )?
2676 (?:
2677 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2678 )?Z?$''', s)
2679 if m:
2680 days, hours, mins, secs, ms = m.groups()
2681 else:
2682 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2683 if m:
2684 hours, mins = m.groups()
2685 else:
2686 return None
2687
2688 if ms:
2689 ms = ms.replace(':', '.')
2690 return sum(float(part or 0) * mult for part, mult in (
2691 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2692
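# Illustrative usage (informal sketch):
#   parse_duration('9:12:43')  # -> 33163.0
#   parse_duration('3 min')    # -> 180.0
#   parse_duration('PT1H30M')  # -> 5400.0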
2693
2694 def prepend_extension(filename, ext, expected_real_ext=None):
2695 name, real_ext = os.path.splitext(filename)
2696 return (
2697 f'{name}.{ext}{real_ext}'
2698 if not expected_real_ext or real_ext[1:] == expected_real_ext
2699 else f'{filename}.{ext}')
2700
2701
2702 def replace_extension(filename, ext, expected_real_ext=None):
2703 name, real_ext = os.path.splitext(filename)
2704 return '{}.{}'.format(
2705 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706 ext)
2707
2708
2709 def check_executable(exe, args=[]):
2710 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711 args can be a list of arguments for a short output (like -version) """
2712 try:
2713 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2714 except OSError:
2715 return False
2716 return exe
2717
2718
2719 def _get_exe_version_output(exe, args):
2720 try:
2721 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2722 # SIGTTOU if yt-dlp is run in the background.
2723 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2724 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2726 except OSError:
2727 return False
2728 return stdout
2729
2730
2731 def detect_exe_version(output, version_re=None, unrecognized='present'):
2732 assert isinstance(output, str)
2733 if version_re is None:
2734 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735 m = re.search(version_re, output)
2736 if m:
2737 return m.group(1)
2738 else:
2739 return unrecognized
2740
2741
2742 def get_exe_version(exe, args=['--version'],
2743 version_re=None, unrecognized='present'):
2744 """ Returns the version of the specified executable,
2745 or False if the executable is not present """
2746 out = _get_exe_version_output(exe, args)
2747 return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
2750 def frange(start=0, stop=None, step=1):
2751 """Float range"""
2752 if stop is None:
2753 start, stop = 0, start
2754 sign = [-1, 1][step > 0] if step else 0
2755 while sign * start < sign * stop:
2756 yield start
2757 start += step
2758
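# Illustrative usage (informal sketch):
#   list(frange(2))           # -> [0, 1]
#   list(frange(0, 1, 0.25))  # -> [0, 0.25, 0.5, 0.75]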
2759
2760 class LazyList(collections.abc.Sequence):
2761 """Lazy immutable list from an iterable
2762 Note that slices of a LazyList are lists and not LazyList"""
2763
2764 class IndexError(IndexError):
2765 pass
2766
2767 def __init__(self, iterable, *, reverse=False, _cache=None):
2768 self._iterable = iter(iterable)
2769 self._cache = [] if _cache is None else _cache
2770 self._reversed = reverse
2771
2772 def __iter__(self):
2773 if self._reversed:
2774 # We need to consume the entire iterable to iterate in reverse
2775 yield from self.exhaust()
2776 return
2777 yield from self._cache
2778 for item in self._iterable:
2779 self._cache.append(item)
2780 yield item
2781
2782 def _exhaust(self):
2783 self._cache.extend(self._iterable)
2784 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2785 return self._cache
2786
2787 def exhaust(self):
2788 """Evaluate the entire iterable"""
2789 return self._exhaust()[::-1 if self._reversed else 1]
2790
2791 @staticmethod
2792 def _reverse_index(x):
2793 return None if x is None else ~x
2794
2795 def __getitem__(self, idx):
2796 if isinstance(idx, slice):
2797 if self._reversed:
2798 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2799 start, stop, step = idx.start, idx.stop, idx.step or 1
2800 elif isinstance(idx, int):
2801 if self._reversed:
2802 idx = self._reverse_index(idx)
2803 start, stop, step = idx, idx, 0
2804 else:
2805 raise TypeError('indices must be integers or slices')
2806 if ((start or 0) < 0 or (stop or 0) < 0
2807 or (start is None and step < 0)
2808 or (stop is None and step > 0)):
2809 # We need to consume the entire iterable to be able to slice from the end
2810 # Obviously, never use this with infinite iterables
2811 self._exhaust()
2812 try:
2813 return self._cache[idx]
2814 except IndexError as e:
2815 raise self.IndexError(e) from e
2816 n = max(start or 0, stop or 0) - len(self._cache) + 1
2817 if n > 0:
2818 self._cache.extend(itertools.islice(self._iterable, n))
2819 try:
2820 return self._cache[idx]
2821 except IndexError as e:
2822 raise self.IndexError(e) from e
2823
2824 def __bool__(self):
2825 try:
2826 self[-1] if self._reversed else self[0]
2827 except self.IndexError:
2828 return False
2829 return True
2830
2831 def __len__(self):
2832 self._exhaust()
2833 return len(self._cache)
2834
2835 def __reversed__(self):
2836 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2837
2838 def __copy__(self):
2839 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2840
2841 def __repr__(self):
2842 # repr and str should mimic a list. So we exhaust the iterable
2843 return repr(self.exhaust())
2844
2845 def __str__(self):
2846 return repr(self.exhaust())
2847
2848
2849 class PagedList:
2850
2851 class IndexError(IndexError):
2852 pass
2853
2854 def __len__(self):
2855 # This is only useful for tests
2856 return len(self.getslice())
2857
2858 def __init__(self, pagefunc, pagesize, use_cache=True):
2859 self._pagefunc = pagefunc
2860 self._pagesize = pagesize
2861 self._pagecount = float('inf')
2862 self._use_cache = use_cache
2863 self._cache = {}
2864
2865 def getpage(self, pagenum):
2866 page_results = self._cache.get(pagenum)
2867 if page_results is None:
2868 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2869 if self._use_cache:
2870 self._cache[pagenum] = page_results
2871 return page_results
2872
2873 def getslice(self, start=0, end=None):
2874 return list(self._getslice(start, end))
2875
2876 def _getslice(self, start, end):
2877 raise NotImplementedError('This method must be implemented by subclasses')
2878
2879 def __getitem__(self, idx):
2880 assert self._use_cache, 'Indexing PagedList requires cache'
2881 if not isinstance(idx, int) or idx < 0:
2882 raise TypeError('indices must be non-negative integers')
2883 entries = self.getslice(idx, idx + 1)
2884 if not entries:
2885 raise self.IndexError()
2886 return entries[0]
2887
2888
2889 class OnDemandPagedList(PagedList):
2890 """Download pages until a page with less than maximum results"""
2891
2892 def _getslice(self, start, end):
2893 for pagenum in itertools.count(start // self._pagesize):
2894 firstid = pagenum * self._pagesize
2895 nextfirstid = pagenum * self._pagesize + self._pagesize
2896 if start >= nextfirstid:
2897 continue
2898
2899 startv = (
2900 start % self._pagesize
2901 if firstid <= start < nextfirstid
2902 else 0)
2903 endv = (
2904 ((end - 1) % self._pagesize) + 1
2905 if (end is not None and firstid <= end <= nextfirstid)
2906 else None)
2907
2908 try:
2909 page_results = self.getpage(pagenum)
2910 except Exception:
2911 self._pagecount = pagenum - 1
2912 raise
2913 if startv != 0 or endv is not None:
2914 page_results = page_results[startv:endv]
2915 yield from page_results
2916
2917 # A little optimization: if the current page is not "full", i.e. does
2918 # not contain page_size videos, then we can assume that this page
2919 # is the last one - there are no more ids on further pages -
2920 # so there is no need to query again.
2921 if len(page_results) + startv < self._pagesize:
2922 break
2923
2924 # If we got the whole page, but the next page is not interesting,
2925 # break out early as well
2926 if end == nextfirstid:
2927 break
2928
2929
2930 class InAdvancePagedList(PagedList):
2931 """PagedList with total number of pages known in advance"""
2932
2933 def __init__(self, pagefunc, pagecount, pagesize):
2934 PagedList.__init__(self, pagefunc, pagesize, True)
2935 self._pagecount = pagecount
2936
2937 def _getslice(self, start, end):
2938 start_page = start // self._pagesize
2939 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2940 skip_elems = start - start_page * self._pagesize
2941 only_more = None if end is None else end - start
2942 for pagenum in range(start_page, end_page):
2943 page_results = self.getpage(pagenum)
2944 if skip_elems:
2945 page_results = page_results[skip_elems:]
2946 skip_elems = None
2947 if only_more is not None:
2948 if len(page_results) < only_more:
2949 only_more -= len(page_results)
2950 else:
2951 yield from page_results[:only_more]
2952 break
2953 yield from page_results
2954
2955
2956 class PlaylistEntries:
2957 MissingEntry = object()
2958 is_exhausted = False
2959
2960 def __init__(self, ydl, info_dict):
2961 self.ydl = ydl
2962
2963 # _entries must be assigned now since info_dict can change during iteration
2964 entries = info_dict.get('entries')
2965 if entries is None:
2966 raise EntryNotInPlaylist('There are no entries')
2967 elif isinstance(entries, list):
2968 self.is_exhausted = True
2969
2970 requested_entries = info_dict.get('requested_entries')
2971 self.is_incomplete = requested_entries is not None
2972 if self.is_incomplete:
2973 assert self.is_exhausted
2974 self._entries = [self.MissingEntry] * max(requested_entries or [0])
2975 for i, entry in zip(requested_entries, entries):
2976 self._entries[i - 1] = entry
2977 elif isinstance(entries, (list, PagedList, LazyList)):
2978 self._entries = entries
2979 else:
2980 self._entries = LazyList(entries)
2981
2982 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983 (?P<start>[+-]?\d+)?
2984 (?P<range>[:-]
2985 (?P<end>[+-]?\d+|inf(?:inite)?)?
2986 (?::(?P<step>[+-]?\d+))?
2987 )?''')
2988
2989 @classmethod
2990 def parse_playlist_items(cls, string):
2991 for segment in string.split(','):
2992 if not segment:
2993 raise ValueError('There are two or more consecutive commas')
2994 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995 if not mobj:
2996 raise ValueError(f'{segment!r} is not a valid specification')
2997 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998 if int_or_none(step) == 0:
2999 raise ValueError(f'Step in {segment!r} cannot be zero')
3000 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3001
3002 def get_requested_items(self):
3003 playlist_items = self.ydl.params.get('playlist_items')
3004 playlist_start = self.ydl.params.get('playliststart', 1)
3005 playlist_end = self.ydl.params.get('playlistend')
3006 # For backwards compatibility, interpret -1 as whole list
3007 if playlist_end in (-1, None):
3008 playlist_end = ''
3009 if not playlist_items:
3010 playlist_items = f'{playlist_start}:{playlist_end}'
3011 elif playlist_start != 1 or playlist_end:
3012 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014 for index in self.parse_playlist_items(playlist_items):
3015 for i, entry in self[index]:
3016 yield i, entry
3017 if not entry:
3018 continue
3019 try:
3020 # TODO: Add auto-generated fields
3021 self.ydl._match_entry(entry, incomplete=True, silent=True)
3022 except (ExistingVideoReached, RejectedVideoReached):
3023 return
3024
3025 def get_full_count(self):
3026 if self.is_exhausted and not self.is_incomplete:
3027 return len(self)
3028 elif isinstance(self._entries, InAdvancePagedList):
3029 if self._entries._pagesize == 1:
3030 return self._entries._pagecount
3031
3032 @functools.cached_property
3033 def _getter(self):
3034 if isinstance(self._entries, list):
3035 def get_entry(i):
3036 try:
3037 entry = self._entries[i]
3038 except IndexError:
3039 entry = self.MissingEntry
3040 if not self.is_incomplete:
3041 raise self.IndexError()
3042 if entry is self.MissingEntry:
3043 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3044 return entry
3045 else:
3046 def get_entry(i):
3047 try:
3048 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049 except (LazyList.IndexError, PagedList.IndexError):
3050 raise self.IndexError()
3051 return get_entry
3052
3053 def __getitem__(self, idx):
3054 if isinstance(idx, int):
3055 idx = slice(idx, idx)
3056
3057 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058 step = 1 if idx.step is None else idx.step
3059 if idx.start is None:
3060 start = 0 if step > 0 else len(self) - 1
3061 else:
3062 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064 # NB: Do not call len(self) when idx == [:]
3065 if idx.stop is None:
3066 stop = 0 if step < 0 else float('inf')
3067 else:
3068 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069 stop += [-1, 1][step > 0]
3070
3071 for i in frange(start, stop, step):
3072 if i < 0:
3073 continue
3074 try:
3075 entry = self._getter(i)
3076 except self.IndexError:
3077 self.is_exhausted = True
3078 if step > 0:
3079 break
3080 continue
3081 yield i + 1, entry
3082
3083 def __len__(self):
3084 return len(tuple(self[:]))
3085
3086 class IndexError(IndexError):
3087 pass
3088
3089
3090 def uppercase_escape(s):
3091 unicode_escape = codecs.getdecoder('unicode_escape')
3092 return re.sub(
3093 r'\\U[0-9a-fA-F]{8}',
3094 lambda m: unicode_escape(m.group(0))[0],
3095 s)
3096
3097
3098 def lowercase_escape(s):
3099 unicode_escape = codecs.getdecoder('unicode_escape')
3100 return re.sub(
3101 r'\\u[0-9a-fA-F]{4}',
3102 lambda m: unicode_escape(m.group(0))[0],
3103 s)
3104
3105
3106 def escape_rfc3986(s):
3107 """Escape non-ASCII characters as suggested by RFC 3986"""
3108 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3109
3110
3111 def escape_url(url):
3112 """Escape URL as suggested by RFC 3986"""
3113 url_parsed = urllib.parse.urlparse(url)
3114 return url_parsed._replace(
3115 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3116 path=escape_rfc3986(url_parsed.path),
3117 params=escape_rfc3986(url_parsed.params),
3118 query=escape_rfc3986(url_parsed.query),
3119 fragment=escape_rfc3986(url_parsed.fragment)
3120 ).geturl()
3121
3122
3123 def parse_qs(url, **kwargs):
3124 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3125
3126
3127 def read_batch_urls(batch_fd):
3128 def fixup(url):
3129 if not isinstance(url, str):
3130 url = url.decode('utf-8', 'replace')
3131 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132 for bom in BOM_UTF8:
3133 if url.startswith(bom):
3134 url = url[len(bom):]
3135 url = url.lstrip()
3136 if not url or url.startswith(('#', ';', ']')):
3137 return False
3138 # "#" cannot be stripped out since it is part of the URI
3139 # However, it can be safely stripped out if it follows whitespace
3140 return re.split(r'\s#', url, 1)[0].rstrip()
3141
3142 with contextlib.closing(batch_fd) as fd:
3143 return [url for url in map(fixup, fd) if url]
3144
3145
3146 def urlencode_postdata(*args, **kargs):
3147 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3148
3149
3150 def update_url_query(url, query):
3151 if not query:
3152 return url
3153 parsed_url = urllib.parse.urlparse(url)
3154 qs = urllib.parse.parse_qs(parsed_url.query)
3155 qs.update(query)
3156 return urllib.parse.urlunparse(parsed_url._replace(
3157 query=urllib.parse.urlencode(qs, True)))
3158
3159
3160 def update_Request(req, url=None, data=None, headers=None, query=None):
3161 req_headers = req.headers.copy()
3162 req_headers.update(headers or {})
3163 req_data = data or req.data
3164 req_url = update_url_query(url or req.get_full_url(), query)
3165 req_get_method = req.get_method()
3166 if req_get_method == 'HEAD':
3167 req_type = HEADRequest
3168 elif req_get_method == 'PUT':
3169 req_type = PUTRequest
3170 else:
3171 req_type = urllib.request.Request
3172 new_req = req_type(
3173 req_url, data=req_data, headers=req_headers,
3174 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175 if hasattr(req, 'timeout'):
3176 new_req.timeout = req.timeout
3177 return new_req
3178
3179
3180 def _multipart_encode_impl(data, boundary):
3181 content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183 out = b''
3184 for k, v in data.items():
3185 out += b'--' + boundary.encode('ascii') + b'\r\n'
3186 if isinstance(k, str):
3187 k = k.encode()
3188 if isinstance(v, str):
3189 v = v.encode()
3190 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3192 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3193 if boundary.encode('ascii') in content:
3194 raise ValueError('Boundary overlaps with data')
3195 out += content
3196
3197 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199 return out, content_type
3200
3201
3202 def multipart_encode(data, boundary=None):
3203 '''
3204 Encode a dict to RFC 7578-compliant form-data
3205
3206 data:
3207 A dict where keys and values can be either Unicode or bytes-like
3208 objects.
3209 boundary:
3210 If specified, it must be a Unicode object and is used as the boundary.
3211 Otherwise a random boundary is generated.
3212
3213 Reference: https://tools.ietf.org/html/rfc7578
3214 '''
3215 has_specified_boundary = boundary is not None
3216
3217 while True:
3218 if boundary is None:
3219 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221 try:
3222 out, content_type = _multipart_encode_impl(data, boundary)
3223 break
3224 except ValueError:
3225 if has_specified_boundary:
3226 raise
3227 boundary = None
3228
3229 return out, content_type
3230
3231
3232 def variadic(x, allowed_types=(str, bytes, dict)):
3233 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
3235
3236 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3237 for val in map(d.get, variadic(key_or_keys)):
3238 if val is not None and (val or not skip_false_values):
3239 return val
3240 return default
3241
3242
3243 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244 for f in funcs:
3245 try:
3246 val = f(*args, **kwargs)
3247 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3248 pass
3249 else:
3250 if expected_type is None or isinstance(val, expected_type):
3251 return val
3252
3253
3254 def try_get(src, getter, expected_type=None):
3255 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3256
3257
3258 def filter_dict(dct, cndn=lambda _, v: v is not None):
3259 return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
3262 def merge_dicts(*dicts):
3263 merged = {}
3264 for a_dict in dicts:
3265 for k, v in a_dict.items():
3266 if (v is not None and k not in merged
3267 or isinstance(v, str) and merged[k] == ''):
3268 merged[k] = v
3269 return merged
3270
3271
3272 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3273 return string if isinstance(string, str) else str(string, encoding, errors)
3274
3275
3276 US_RATINGS = {
3277 'G': 0,
3278 'PG': 10,
3279 'PG-13': 13,
3280 'R': 16,
3281 'NC': 18,
3282 }
3283
3284
3285 TV_PARENTAL_GUIDELINES = {
3286 'TV-Y': 0,
3287 'TV-Y7': 7,
3288 'TV-G': 0,
3289 'TV-PG': 0,
3290 'TV-14': 14,
3291 'TV-MA': 17,
3292 }
3293
3294
3295 def parse_age_limit(s):
3296 # isinstance(False, int) is True. So type() must be used instead
3297 if type(s) is int: # noqa: E721
3298 return s if 0 <= s <= 21 else None
3299 elif not isinstance(s, str):
3300 return None
3301 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3302 if m:
3303 return int(m.group('age'))
3304 s = s.upper()
3305 if s in US_RATINGS:
3306 return US_RATINGS[s]
3307 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3308 if m:
3309 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3310 return None
3311
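# Illustrative usage (informal sketch):
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17
#   parse_age_limit(18)       # -> 18 (booleans return None)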
3312
3313 def strip_jsonp(code):
3314 return re.sub(
3315 r'''(?sx)^
3316 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3317 (?:\s*&&\s*(?P=func_name))?
3318 \s*\(\s*(?P<callback_data>.*)\);?
3319 \s*?(?://[^\n]*)*$''',
3320 r'\g<callback_data>', code)
3321
3322
3323 def js_to_json(code, vars={}, *, strict=False):
3324 # vars is a dict of var, val pairs to substitute
3325 STRING_QUOTES = '\'"'
3326 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3327 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3328 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3329 INTEGER_TABLE = (
3330 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3332 )
3333
3334 def process_escape(match):
3335 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336 escape = match.group(1) or match.group(2)
3337
3338 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339 else R'\u00' if escape == 'x'
3340 else '' if escape == '\n'
3341 else escape)
3342
3343 def fix_kv(m):
3344 v = m.group(0)
3345 if v in ('true', 'false', 'null'):
3346 return v
3347 elif v in ('undefined', 'void 0'):
3348 return 'null'
3349 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3350 return ''
3351
3352 if v[0] in STRING_QUOTES:
3353 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354 return f'"{escaped}"'
3355
3356 for regex, base in INTEGER_TABLE:
3357 im = re.match(regex, v)
3358 if im:
3359 i = int(im.group(1), base)
3360 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362 if v in vars:
3363 return json.dumps(vars[v])
3364
3365 if not strict:
3366 return f'"{v}"'
3367
3368 raise ValueError(f'Unknown value: {v}')
3369
3370 def create_map(mobj):
3371 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3372
3373 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3374 if not strict:
3375 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3376 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3377
3378 return re.sub(rf'''(?sx)
3379 {STRING_RE}|
3380 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3381 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3382 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3383 [0-9]+(?={SKIP_RE}:)|
3384 !+
3385 ''', fix_kv, code)
3386
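# Illustrative usage (informal sketch): relaxed JavaScript object literals
# become strict JSON that json.loads() can consume:
#   js_to_json("{foo: 'bar', baz: 0x10}")  # -> '{"foo": "bar", "baz": 16}'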
3387
3388 def qualities(quality_ids):
3389 """ Get a numeric quality value out of a list of possible values """
3390 def q(qid):
3391 try:
3392 return quality_ids.index(qid)
3393 except ValueError:
3394 return -1
3395 return q
3396
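# Illustrative usage (informal sketch):
#   q = qualities(['144p', '240p', '720p'])
#   q('240p')   # -> 1
#   q('1080p')  # -> -1 (unknown IDs sort below all known ones)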
3397
3398 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3399
3400
3401 DEFAULT_OUTTMPL = {
3402 'default': '%(title)s [%(id)s].%(ext)s',
3403 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3404 }
3405 OUTTMPL_TYPES = {
3406 'chapter': None,
3407 'subtitle': None,
3408 'thumbnail': None,
3409 'description': 'description',
3410 'annotation': 'annotations.xml',
3411 'infojson': 'info.json',
3412 'link': None,
3413 'pl_video': None,
3414 'pl_thumbnail': None,
3415 'pl_description': 'description',
3416 'pl_infojson': 'info.json',
3417 }
3418
3419 # As of [1], the format syntax is:
3420 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3421 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3422 STR_FORMAT_RE_TMPL = r'''(?x)
3423 (?<!%)(?P<prefix>(?:%%)*)
3424 %
3425 (?P<has_key>\((?P<key>{0})\))?
3426 (?P<format>
3427 (?P<conversion>[#0\-+ ]+)?
3428 (?P<min_width>\d+)?
3429 (?P<precision>\.\d+)?
3430 (?P<len_mod>[hlL])? # unused in python
3431 {1} # conversion type
3432 )
3433 '''
3434
3435
3436 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3437
3438
3439 def limit_length(s, length):
3440 """ Add ellipses to overly long strings """
3441 if s is None:
3442 return None
3443 ELLIPSES = '...'
3444 if len(s) > length:
3445 return s[:length - len(ELLIPSES)] + ELLIPSES
3446 return s
3447
3448
3449 def version_tuple(v):
3450 return tuple(int(e) for e in re.split(r'[-.]', v))
3451
3452
3453 def is_outdated_version(version, limit, assume_new=True):
3454 if not version:
3455 return not assume_new
3456 try:
3457 return version_tuple(version) < version_tuple(limit)
3458 except ValueError:
3459 return not assume_new
3460
3461
3462 def ytdl_is_updateable():
3463 """ Returns if yt-dlp can be updated with -U """
3464
3465 from .update import is_non_updateable
3466
3467 return not is_non_updateable()
3468
3469
3470 def args_to_str(args):
3471 # Get a short string representation for a subprocess command
3472 return ' '.join(compat_shlex_quote(a) for a in args)
3473
3474
3475 def error_to_compat_str(err):
3476 return str(err)
3477
3478
3479 def error_to_str(err):
3480 return f'{type(err).__name__}: {err}'
3481
3482
3483 def mimetype2ext(mt):
3484 if mt is None:
3485 return None
3486
3487 mt, _, params = mt.partition(';')
3488 mt = mt.strip()
3489
3490 FULL_MAP = {
3491 'audio/mp4': 'm4a',
3492 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3493 # since it's the most popular one
3494 'audio/mpeg': 'mp3',
3495 'audio/x-wav': 'wav',
3496 'audio/wav': 'wav',
3497 'audio/wave': 'wav',
3498 }
3499
3500 ext = FULL_MAP.get(mt)
3501 if ext is not None:
3502 return ext
3503
3504 SUBTYPE_MAP = {
3505 '3gpp': '3gp',
3506 'smptett+xml': 'tt',
3507 'ttaf+xml': 'dfxp',
3508 'ttml+xml': 'ttml',
3509 'x-flv': 'flv',
3510 'x-mp4-fragmented': 'mp4',
3511 'x-ms-sami': 'sami',
3512 'x-ms-wmv': 'wmv',
3513 'mpegurl': 'm3u8',
3514 'x-mpegurl': 'm3u8',
3515 'vnd.apple.mpegurl': 'm3u8',
3516 'dash+xml': 'mpd',
3517 'f4m+xml': 'f4m',
3518 'hds+xml': 'f4m',
3519 'vnd.ms-sstr+xml': 'ism',
3520 'quicktime': 'mov',
3521 'mp2t': 'ts',
3522 'x-wav': 'wav',
3523 'filmstrip+json': 'fs',
3524 'svg+xml': 'svg',
3525 }
3526
3527 _, _, subtype = mt.rpartition('/')
3528 ext = SUBTYPE_MAP.get(subtype.lower())
3529 if ext is not None:
3530 return ext
3531
3532 SUFFIX_MAP = {
3533 'json': 'json',
3534 'xml': 'xml',
3535 'zip': 'zip',
3536 'gzip': 'gz',
3537 }
3538
3539 _, _, suffix = subtype.partition('+')
3540 ext = SUFFIX_MAP.get(suffix)
3541 if ext is not None:
3542 return ext
3543
3544 return subtype.replace('+', '.')
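
# For instance (illustrative, exercising each lookup tier in turn):
#   >>> mimetype2ext('audio/mp4')                            # full-type map
#   'm4a'
#   >>> mimetype2ext('application/x-mpegURL')                # subtype map
#   'm3u8'
#   >>> mimetype2ext('application/ld+json; charset=utf-8')   # '+suffix' map
#   'json'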
3545
3546
3547 def ext2mimetype(ext_or_url):
3548 if not ext_or_url:
3549 return None
3550 if '.' not in ext_or_url:
3551 ext_or_url = f'file.{ext_or_url}'
3552 return mimetypes.guess_type(ext_or_url)[0]
3553
3554
3555 def parse_codecs(codecs_str):
3556 # http://tools.ietf.org/html/rfc6381
3557 if not codecs_str:
3558 return {}
3559 split_codecs = list(filter(None, map(
3560 str.strip, codecs_str.strip().strip(',').split(','))))
3561 vcodec, acodec, scodec, hdr = None, None, None, None
3562 for full_codec in split_codecs:
3563 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3564 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3565 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3566 if vcodec:
3567 continue
3568 vcodec = full_codec
3569 if parts[0] in ('dvh1', 'dvhe'):
3570 hdr = 'DV'
3571 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3572 hdr = 'HDR10'
3573 elif parts[:2] == ['vp9', '2']:
3574 hdr = 'HDR10'
3575 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3576 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3577 acodec = acodec or full_codec
3578 elif parts[0] in ('stpp', 'wvtt'):
3579 scodec = scodec or full_codec
3580 else:
3581 write_string(f'WARNING: Unknown codec {full_codec}\n')
3582 if vcodec or acodec or scodec:
3583 return {
3584 'vcodec': vcodec or 'none',
3585 'acodec': acodec or 'none',
3586 'dynamic_range': hdr,
3587 **({'scodec': scodec} if scodec is not None else {}),
3588 }
3589 elif len(split_codecs) == 2:
3590 return {
3591 'vcodec': split_codecs[0],
3592 'acodec': split_codecs[1],
3593 }
3594 return {}
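
# For instance (illustrative codec strings):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   >>> parse_codecs('dvh1.05.06')
#   {'vcodec': 'dvh1.05.06', 'acodec': 'none', 'dynamic_range': 'DV'}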
3595
3596
3597 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3598 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3599
3600 allow_mkv = not preferences or 'mkv' in preferences
3601
3602 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3603 return 'mkv' # TODO: any other format allows this?
3604
3605 # TODO: Not all codecs supported by parse_codecs are handled here
3606 COMPATIBLE_CODECS = {
3607 'mp4': {
3608 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3609 'h264', 'aacl', 'ec-3', # Set in ISM
3610 },
3611 'webm': {
3612 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3613 'vp9x', 'vp8x', # in the webm spec
3614 },
3615 }
3616
3617 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3618 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3619
3620 for ext in preferences or COMPATIBLE_CODECS.keys():
3621 codec_set = COMPATIBLE_CODECS.get(ext, set())
3622 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3623 return ext
3624
3625 COMPATIBLE_EXTS = (
3626 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3627 {'webm'},
3628 )
3629 for ext in preferences or vexts:
3630 current_exts = {ext, *vexts, *aexts}
3631 if ext == 'mkv' or current_exts == {ext} or any(
3632 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3633 return ext
3634 return 'mkv' if allow_mkv else preferences[-1]
3635
3636
3637 def urlhandle_detect_ext(url_handle):
3638 getheader = url_handle.headers.get
3639
3640 cd = getheader('Content-Disposition')
3641 if cd:
3642 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3643 if m:
3644 e = determine_ext(m.group('filename'), default_ext=None)
3645 if e:
3646 return e
3647
3648 return mimetype2ext(getheader('Content-Type'))
3649
3650
3651 def encode_data_uri(data, mime_type):
3652 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3653
3654
3655 def age_restricted(content_limit, age_limit):
3656 """ Returns True iff the content should be blocked """
3657
3658 if age_limit is None: # No limit set
3659 return False
3660 if content_limit is None:
3661 return False # Content available for everyone
3662 return age_limit < content_limit
3663
3664
3665 # List of known byte-order-marks (BOM)
3666 BOMS = [
3667 (b'\xef\xbb\xbf', 'utf-8'),
3668 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3669 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3670 (b'\xff\xfe', 'utf-16-le'),
3671 (b'\xfe\xff', 'utf-16-be'),
3672 ]
3673
3674
3675 def is_html(first_bytes):
3676 """ Detect whether a file contains HTML by examining its first bytes. """
3677
3678 encoding = 'utf-8'
3679 for bom, enc in BOMS:
3680 while first_bytes.startswith(bom):
3681 encoding, first_bytes = enc, first_bytes[len(bom):]
3682
3683 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3684
3685
3686 def determine_protocol(info_dict):
3687 protocol = info_dict.get('protocol')
3688 if protocol is not None:
3689 return protocol
3690
3691 url = sanitize_url(info_dict['url'])
3692 if url.startswith('rtmp'):
3693 return 'rtmp'
3694 elif url.startswith('mms'):
3695 return 'mms'
3696 elif url.startswith('rtsp'):
3697 return 'rtsp'
3698
3699 ext = determine_ext(url)
3700 if ext == 'm3u8':
3701 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3702 elif ext == 'f4m':
3703 return 'f4m'
3704
3705 return urllib.parse.urlparse(url).scheme
3706
3707
3708 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3709 """ Render a list of rows, each as a list of values.
3710 Text after a \t will be right aligned """
3711 def width(string):
3712 return len(remove_terminal_sequences(string).replace('\t', ''))
3713
3714 def get_max_lens(table):
3715 return [max(width(str(v)) for v in col) for col in zip(*table)]
3716
3717 def filter_using_list(row, filterArray):
3718 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3719
3720 max_lens = get_max_lens(data) if hide_empty else []
3721 header_row = filter_using_list(header_row, max_lens)
3722 data = [filter_using_list(row, max_lens) for row in data]
3723
3724 table = [header_row] + data
3725 max_lens = get_max_lens(table)
3726 extra_gap += 1
3727 if delim:
3728 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3729 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3730 for row in table:
3731 for pos, text in enumerate(map(str, row)):
3732 if '\t' in text:
3733 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3734 else:
3735 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3736 ret = '\n'.join(''.join(row).rstrip() for row in table)
3737 return ret
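
# For instance (illustrative): columns are padded to the widest cell:
#   >>> print(render_table(['id', 'name'], [['1', 'foo'], ['22', 'bar']]))
#   id name
#   1  foo
#   22 bar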
3738
3739
3740 def _match_one(filter_part, dct, incomplete):
3741 # TODO: Generalize code with YoutubeDL._build_format_filter
3742 STRING_OPERATORS = {
3743 '*=': operator.contains,
3744 '^=': lambda attr, value: attr.startswith(value),
3745 '$=': lambda attr, value: attr.endswith(value),
3746 '~=': lambda attr, value: re.search(value, attr),
3747 }
3748 COMPARISON_OPERATORS = {
3749 **STRING_OPERATORS,
3750 '<=': operator.le, # "<=" must be defined above "<"
3751 '<': operator.lt,
3752 '>=': operator.ge,
3753 '>': operator.gt,
3754 '=': operator.eq,
3755 }
3756
3757 if isinstance(incomplete, bool):
3758 is_incomplete = lambda _: incomplete
3759 else:
3760 is_incomplete = lambda k: k in incomplete
3761
3762 operator_rex = re.compile(r'''(?x)
3763 (?P<key>[a-z_]+)
3764 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3765 (?:
3766 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3767 (?P<strval>.+?)
3768 )
3769 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3770 m = operator_rex.fullmatch(filter_part.strip())
3771 if m:
3772 m = m.groupdict()
3773 unnegated_op = COMPARISON_OPERATORS[m['op']]
3774 if m['negation']:
3775 op = lambda attr, value: not unnegated_op(attr, value)
3776 else:
3777 op = unnegated_op
3778 comparison_value = m['quotedstrval'] or m['strval']  # exactly one of these matched; the old "or m['intval']" referenced a group the regex does not define
3779 if m['quote']:
3780 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3781 actual_value = dct.get(m['key'])
3782 numeric_comparison = None
3783 if isinstance(actual_value, (int, float)):
3784 # If the original field is a string and the matching comparison value is
3785 # a number, we should respect the origin of the original field
3786 # and process the comparison value as a string (see
3787 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3788 try:
3789 numeric_comparison = int(comparison_value)
3790 except ValueError:
3791 numeric_comparison = parse_filesize(comparison_value)
3792 if numeric_comparison is None:
3793 numeric_comparison = parse_filesize(f'{comparison_value}B')
3794 if numeric_comparison is None:
3795 numeric_comparison = parse_duration(comparison_value)
3796 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3797 raise ValueError('Operator %s only supports string values!' % m['op'])
3798 if actual_value is None:
3799 return is_incomplete(m['key']) or m['none_inclusive']
3800 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3801
3802 UNARY_OPERATORS = {
3803 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3804 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3805 }
3806 operator_rex = re.compile(r'''(?x)
3807 (?P<op>%s)\s*(?P<key>[a-z_]+)
3808 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3809 m = operator_rex.fullmatch(filter_part.strip())
3810 if m:
3811 op = UNARY_OPERATORS[m.group('op')]
3812 actual_value = dct.get(m.group('key'))
3813 if is_incomplete(m.group('key')) and actual_value is None:
3814 return True
3815 return op(actual_value)
3816
3817 raise ValueError('Invalid filter part %r' % filter_part)
3818
3819
3820 def match_str(filter_str, dct, incomplete=False):
3821 """ Filter a dictionary with a simple string syntax.
3822 @returns Whether the filter passes
3823 @param incomplete Set of keys that are expected to be missing from dct.
3824 Can be True/False to indicate all/none of the keys may be missing.
3825 All conditions on incomplete keys pass if the key is missing
3826 """
3827 return all(
3828 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3829 for filter_part in re.split(r'(?<!\\)&', filter_str))
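
# For instance (illustrative): '&' separates conditions, '!' negates, and a
# trailing '?' on an operator lets a missing field pass the condition:
#   >>> match_str('duration > 60 & like_count >? 100', {'duration': 120})
#   True
#   >>> match_str('!is_live', {'is_live': True})
#   False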
3830
3831
3832 def match_filter_func(filters):
3833 if not filters:
3834 return None
3835 filters = set(variadic(filters))
3836
3837 interactive = '-' in filters
3838 if interactive:
3839 filters.remove('-')
3840
3841 def _match_func(info_dict, incomplete=False):
3842 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3843 return NO_DEFAULT if interactive and not incomplete else None
3844 else:
3845 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3846 filter_str = ') | ('.join(map(str.strip, filters))
3847 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3848 return _match_func
3849
3850
3851 class download_range_func:
3852 def __init__(self, chapters, ranges):
3853 self.chapters, self.ranges = chapters, ranges
3854
3855 def __call__(self, info_dict, ydl):
3856 if not self.ranges and not self.chapters:
3857 yield {}
3858
3859 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3860 else 'Cannot match chapters since chapter information is unavailable')
3861 for regex in self.chapters or []:
3862 for i, chapter in enumerate(info_dict.get('chapters') or []):
3863 if re.search(regex, chapter['title']):
3864 warning = None
3865 yield {**chapter, 'index': i}
3866 if self.chapters and warning:
3867 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3868
3869 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3870
3871 def __eq__(self, other):
3872 return (isinstance(other, download_range_func)
3873 and self.chapters == other.chapters and self.ranges == other.ranges)
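
# For instance (illustrative): used as a `download_ranges` callback, yielding
# the sections to download:
#   >>> f = download_range_func(chapters=None, ranges=[(10, 20)])
#   >>> list(f({'id': 'example'}, ydl=None))
#   [{'start_time': 10, 'end_time': 20}]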
3874
3875
3876 def parse_dfxp_time_expr(time_expr):
3877 if not time_expr:
3878 return
3879
3880 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3881 if mobj:
3882 return float(mobj.group('time_offset'))
3883
3884 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3885 if mobj:
3886 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3887
3888
3889 def srt_subtitles_timecode(seconds):
3890 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3891
3892
3893 def ass_subtitles_timecode(seconds):
3894 time = timetuple_from_msec(seconds * 1000)
3895 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
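
# For instance (illustrative):
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5
#   >>> srt_subtitles_timecode(90.5)
#   '00:01:30,500'
#   >>> ass_subtitles_timecode(90.5)
#   '0:01:30.50'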
3896
3897
3898 def dfxp2srt(dfxp_data):
3899 '''
3900 @param dfxp_data A bytes-like object containing DFXP data
3901 @returns A unicode object containing converted SRT data
3902 '''
3903 LEGACY_NAMESPACES = (
3904 (b'http://www.w3.org/ns/ttml', [
3905 b'http://www.w3.org/2004/11/ttaf1',
3906 b'http://www.w3.org/2006/04/ttaf1',
3907 b'http://www.w3.org/2006/10/ttaf1',
3908 ]),
3909 (b'http://www.w3.org/ns/ttml#styling', [
3910 b'http://www.w3.org/ns/ttml#style',
3911 ]),
3912 )
3913
3914 SUPPORTED_STYLING = [
3915 'color',
3916 'fontFamily',
3917 'fontSize',
3918 'fontStyle',
3919 'fontWeight',
3920 'textDecoration'
3921 ]
3922
3923 _x = functools.partial(xpath_with_ns, ns_map={
3924 'xml': 'http://www.w3.org/XML/1998/namespace',
3925 'ttml': 'http://www.w3.org/ns/ttml',
3926 'tts': 'http://www.w3.org/ns/ttml#styling',
3927 })
3928
3929 styles = {}
3930 default_style = {}
3931
3932 class TTMLPElementParser:
3933 def __init__(self):
3934 # Keep parser state per-instance; class-level lists would be shared across instances
3935 self._out, self._unclosed_elements, self._applied_styles = '', [], []
3936
3937 def start(self, tag, attrib):
3938 if tag in (_x('ttml:br'), 'br'):
3939 self._out += '\n'
3940 else:
3941 unclosed_elements = []
3942 style = {}
3943 element_style_id = attrib.get('style')
3944 if default_style:
3945 style.update(default_style)
3946 if element_style_id:
3947 style.update(styles.get(element_style_id, {}))
3948 for prop in SUPPORTED_STYLING:
3949 prop_val = attrib.get(_x('tts:' + prop))
3950 if prop_val:
3951 style[prop] = prop_val
3952 if style:
3953 font = ''
3954 for k, v in sorted(style.items()):
3955 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3956 continue
3957 if k == 'color':
3958 font += ' color="%s"' % v
3959 elif k == 'fontSize':
3960 font += ' size="%s"' % v
3961 elif k == 'fontFamily':
3962 font += ' face="%s"' % v
3963 elif k == 'fontWeight' and v == 'bold':
3964 self._out += '<b>'
3965 unclosed_elements.append('b')
3966 elif k == 'fontStyle' and v == 'italic':
3967 self._out += '<i>'
3968 unclosed_elements.append('i')
3969 elif k == 'textDecoration' and v == 'underline':
3970 self._out += '<u>'
3971 unclosed_elements.append('u')
3972 if font:
3973 self._out += '<font' + font + '>'
3974 unclosed_elements.append('font')
3975 applied_style = {}
3976 if self._applied_styles:
3977 applied_style.update(self._applied_styles[-1])
3978 applied_style.update(style)
3979 self._applied_styles.append(applied_style)
3980 self._unclosed_elements.append(unclosed_elements)
3981
3982 def end(self, tag):
3983 if tag not in (_x('ttml:br'), 'br'):
3984 unclosed_elements = self._unclosed_elements.pop()
3985 for element in reversed(unclosed_elements):
3986 self._out += '</%s>' % element
3987 if unclosed_elements and self._applied_styles:
3988 self._applied_styles.pop()
3989
3990 def data(self, data):
3991 self._out += data
3992
3993 def close(self):
3994 return self._out.strip()
3995
3996 def parse_node(node):
3997 target = TTMLPElementParser()
3998 parser = xml.etree.ElementTree.XMLParser(target=target)
3999 parser.feed(xml.etree.ElementTree.tostring(node))
4000 return parser.close()
4001
4002 for k, v in LEGACY_NAMESPACES:
4003 for ns in v:
4004 dfxp_data = dfxp_data.replace(ns, k)
4005
4006 dfxp = compat_etree_fromstring(dfxp_data)
4007 out = []
4008 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4009
4010 if not paras:
4011 raise ValueError('Invalid dfxp/TTML subtitle')
4012
4013 repeat = False
4014 while True:
4015 for style in dfxp.findall(_x('.//ttml:style')):
4016 style_id = style.get('id') or style.get(_x('xml:id'))
4017 if not style_id:
4018 continue
4019 parent_style_id = style.get('style')
4020 if parent_style_id:
4021 if parent_style_id not in styles:
4022 repeat = True
4023 continue
4024 styles[style_id] = styles[parent_style_id].copy()
4025 for prop in SUPPORTED_STYLING:
4026 prop_val = style.get(_x('tts:' + prop))
4027 if prop_val:
4028 styles.setdefault(style_id, {})[prop] = prop_val
4029 if repeat:
4030 repeat = False
4031 else:
4032 break
4033
4034 for p in ('body', 'div'):
4035 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4036 if ele is None:
4037 continue
4038 style = styles.get(ele.get('style'))
4039 if not style:
4040 continue
4041 default_style.update(style)
4042
4043 for para, index in zip(paras, itertools.count(1)):
4044 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4045 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4046 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4047 if begin_time is None:
4048 continue
4049 if not end_time:
4050 if not dur:
4051 continue
4052 end_time = begin_time + dur
4053 out.append('%d\n%s --> %s\n%s\n\n' % (
4054 index,
4055 srt_subtitles_timecode(begin_time),
4056 srt_subtitles_timecode(end_time),
4057 parse_node(para)))
4058
4059 return ''.join(out)
4060
4061
4062 def cli_option(params, command_option, param, separator=None):
4063 param = params.get(param)
4064 return ([] if param is None
4065 else [command_option, str(param)] if separator is None
4066 else [f'{command_option}{separator}{param}'])
4067
4068
4069 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4070 param = params.get(param)
4071 assert param in (True, False, None)
4072 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4073
4074
4075 def cli_valueless_option(params, command_option, param, expected_value=True):
4076 return [command_option] if params.get(param) == expected_value else []
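
# For instance (illustrative parameter names):
#   >>> cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#   ['--proxy', 'socks5://127.0.0.1:1080']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'verbose': True}, '-v', 'verbose')
#   ['-v']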
4077
4078
4079 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4080 if isinstance(argdict, (list, tuple)): # for backward compatibility
4081 if use_compat:
4082 return argdict
4083 else:
4084 argdict = None
4085 if argdict is None:
4086 return default
4087 assert isinstance(argdict, dict)
4088
4089 assert isinstance(keys, (list, tuple))
4090 for key_list in keys:
4091 arg_list = list(filter(
4092 lambda x: x is not None,
4093 [argdict.get(key.lower()) for key in variadic(key_list)]))
4094 if arg_list:
4095 return [arg for args in arg_list for arg in args]
4096 return default
4097
4098
4099 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4100 main_key, exe = main_key.lower(), exe.lower()
4101 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4102 keys = [f'{root_key}{k}' for k in (keys or [''])]
4103 if root_key in keys:
4104 if main_key != exe:
4105 keys.append((main_key, exe))
4106 keys.append('default')
4107 else:
4108 use_compat = False
4109 return cli_configuration_args(argdict, keys, default, use_compat)
4110
4111
4112 class ISO639Utils:
4113 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4114 _lang_map = {
4115 'aa': 'aar',
4116 'ab': 'abk',
4117 'ae': 'ave',
4118 'af': 'afr',
4119 'ak': 'aka',
4120 'am': 'amh',
4121 'an': 'arg',
4122 'ar': 'ara',
4123 'as': 'asm',
4124 'av': 'ava',
4125 'ay': 'aym',
4126 'az': 'aze',
4127 'ba': 'bak',
4128 'be': 'bel',
4129 'bg': 'bul',
4130 'bh': 'bih',
4131 'bi': 'bis',
4132 'bm': 'bam',
4133 'bn': 'ben',
4134 'bo': 'bod',
4135 'br': 'bre',
4136 'bs': 'bos',
4137 'ca': 'cat',
4138 'ce': 'che',
4139 'ch': 'cha',
4140 'co': 'cos',
4141 'cr': 'cre',
4142 'cs': 'ces',
4143 'cu': 'chu',
4144 'cv': 'chv',
4145 'cy': 'cym',
4146 'da': 'dan',
4147 'de': 'deu',
4148 'dv': 'div',
4149 'dz': 'dzo',
4150 'ee': 'ewe',
4151 'el': 'ell',
4152 'en': 'eng',
4153 'eo': 'epo',
4154 'es': 'spa',
4155 'et': 'est',
4156 'eu': 'eus',
4157 'fa': 'fas',
4158 'ff': 'ful',
4159 'fi': 'fin',
4160 'fj': 'fij',
4161 'fo': 'fao',
4162 'fr': 'fra',
4163 'fy': 'fry',
4164 'ga': 'gle',
4165 'gd': 'gla',
4166 'gl': 'glg',
4167 'gn': 'grn',
4168 'gu': 'guj',
4169 'gv': 'glv',
4170 'ha': 'hau',
4171 'he': 'heb',
4172 'iw': 'heb', # Replaced by he in 1989 revision
4173 'hi': 'hin',
4174 'ho': 'hmo',
4175 'hr': 'hrv',
4176 'ht': 'hat',
4177 'hu': 'hun',
4178 'hy': 'hye',
4179 'hz': 'her',
4180 'ia': 'ina',
4181 'id': 'ind',
4182 'in': 'ind', # Replaced by id in 1989 revision
4183 'ie': 'ile',
4184 'ig': 'ibo',
4185 'ii': 'iii',
4186 'ik': 'ipk',
4187 'io': 'ido',
4188 'is': 'isl',
4189 'it': 'ita',
4190 'iu': 'iku',
4191 'ja': 'jpn',
4192 'jv': 'jav',
4193 'ka': 'kat',
4194 'kg': 'kon',
4195 'ki': 'kik',
4196 'kj': 'kua',
4197 'kk': 'kaz',
4198 'kl': 'kal',
4199 'km': 'khm',
4200 'kn': 'kan',
4201 'ko': 'kor',
4202 'kr': 'kau',
4203 'ks': 'kas',
4204 'ku': 'kur',
4205 'kv': 'kom',
4206 'kw': 'cor',
4207 'ky': 'kir',
4208 'la': 'lat',
4209 'lb': 'ltz',
4210 'lg': 'lug',
4211 'li': 'lim',
4212 'ln': 'lin',
4213 'lo': 'lao',
4214 'lt': 'lit',
4215 'lu': 'lub',
4216 'lv': 'lav',
4217 'mg': 'mlg',
4218 'mh': 'mah',
4219 'mi': 'mri',
4220 'mk': 'mkd',
4221 'ml': 'mal',
4222 'mn': 'mon',
4223 'mr': 'mar',
4224 'ms': 'msa',
4225 'mt': 'mlt',
4226 'my': 'mya',
4227 'na': 'nau',
4228 'nb': 'nob',
4229 'nd': 'nde',
4230 'ne': 'nep',
4231 'ng': 'ndo',
4232 'nl': 'nld',
4233 'nn': 'nno',
4234 'no': 'nor',
4235 'nr': 'nbl',
4236 'nv': 'nav',
4237 'ny': 'nya',
4238 'oc': 'oci',
4239 'oj': 'oji',
4240 'om': 'orm',
4241 'or': 'ori',
4242 'os': 'oss',
4243 'pa': 'pan',
4244 'pi': 'pli',
4245 'pl': 'pol',
4246 'ps': 'pus',
4247 'pt': 'por',
4248 'qu': 'que',
4249 'rm': 'roh',
4250 'rn': 'run',
4251 'ro': 'ron',
4252 'ru': 'rus',
4253 'rw': 'kin',
4254 'sa': 'san',
4255 'sc': 'srd',
4256 'sd': 'snd',
4257 'se': 'sme',
4258 'sg': 'sag',
4259 'si': 'sin',
4260 'sk': 'slk',
4261 'sl': 'slv',
4262 'sm': 'smo',
4263 'sn': 'sna',
4264 'so': 'som',
4265 'sq': 'sqi',
4266 'sr': 'srp',
4267 'ss': 'ssw',
4268 'st': 'sot',
4269 'su': 'sun',
4270 'sv': 'swe',
4271 'sw': 'swa',
4272 'ta': 'tam',
4273 'te': 'tel',
4274 'tg': 'tgk',
4275 'th': 'tha',
4276 'ti': 'tir',
4277 'tk': 'tuk',
4278 'tl': 'tgl',
4279 'tn': 'tsn',
4280 'to': 'ton',
4281 'tr': 'tur',
4282 'ts': 'tso',
4283 'tt': 'tat',
4284 'tw': 'twi',
4285 'ty': 'tah',
4286 'ug': 'uig',
4287 'uk': 'ukr',
4288 'ur': 'urd',
4289 'uz': 'uzb',
4290 've': 'ven',
4291 'vi': 'vie',
4292 'vo': 'vol',
4293 'wa': 'wln',
4294 'wo': 'wol',
4295 'xh': 'xho',
4296 'yi': 'yid',
4297 'ji': 'yid', # Replaced by yi in 1989 revision
4298 'yo': 'yor',
4299 'za': 'zha',
4300 'zh': 'zho',
4301 'zu': 'zul',
4302 }
4303
4304 @classmethod
4305 def short2long(cls, code):
4306 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4307 return cls._lang_map.get(code[:2])
4308
4309 @classmethod
4310 def long2short(cls, code):
4311 """Convert language code from ISO 639-2/T to ISO 639-1"""
4312 for short_name, long_name in cls._lang_map.items():
4313 if long_name == code:
4314 return short_name
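
# For instance:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'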
4315
4316
4317 class ISO3166Utils:
4318 # From http://data.okfn.org/data/core/country-list
4319 _country_map = {
4320 'AF': 'Afghanistan',
4321 'AX': 'Åland Islands',
4322 'AL': 'Albania',
4323 'DZ': 'Algeria',
4324 'AS': 'American Samoa',
4325 'AD': 'Andorra',
4326 'AO': 'Angola',
4327 'AI': 'Anguilla',
4328 'AQ': 'Antarctica',
4329 'AG': 'Antigua and Barbuda',
4330 'AR': 'Argentina',
4331 'AM': 'Armenia',
4332 'AW': 'Aruba',
4333 'AU': 'Australia',
4334 'AT': 'Austria',
4335 'AZ': 'Azerbaijan',
4336 'BS': 'Bahamas',
4337 'BH': 'Bahrain',
4338 'BD': 'Bangladesh',
4339 'BB': 'Barbados',
4340 'BY': 'Belarus',
4341 'BE': 'Belgium',
4342 'BZ': 'Belize',
4343 'BJ': 'Benin',
4344 'BM': 'Bermuda',
4345 'BT': 'Bhutan',
4346 'BO': 'Bolivia, Plurinational State of',
4347 'BQ': 'Bonaire, Sint Eustatius and Saba',
4348 'BA': 'Bosnia and Herzegovina',
4349 'BW': 'Botswana',
4350 'BV': 'Bouvet Island',
4351 'BR': 'Brazil',
4352 'IO': 'British Indian Ocean Territory',
4353 'BN': 'Brunei Darussalam',
4354 'BG': 'Bulgaria',
4355 'BF': 'Burkina Faso',
4356 'BI': 'Burundi',
4357 'KH': 'Cambodia',
4358 'CM': 'Cameroon',
4359 'CA': 'Canada',
4360 'CV': 'Cape Verde',
4361 'KY': 'Cayman Islands',
4362 'CF': 'Central African Republic',
4363 'TD': 'Chad',
4364 'CL': 'Chile',
4365 'CN': 'China',
4366 'CX': 'Christmas Island',
4367 'CC': 'Cocos (Keeling) Islands',
4368 'CO': 'Colombia',
4369 'KM': 'Comoros',
4370 'CG': 'Congo',
4371 'CD': 'Congo, the Democratic Republic of the',
4372 'CK': 'Cook Islands',
4373 'CR': 'Costa Rica',
4374 'CI': 'Côte d\'Ivoire',
4375 'HR': 'Croatia',
4376 'CU': 'Cuba',
4377 'CW': 'Curaçao',
4378 'CY': 'Cyprus',
4379 'CZ': 'Czech Republic',
4380 'DK': 'Denmark',
4381 'DJ': 'Djibouti',
4382 'DM': 'Dominica',
4383 'DO': 'Dominican Republic',
4384 'EC': 'Ecuador',
4385 'EG': 'Egypt',
4386 'SV': 'El Salvador',
4387 'GQ': 'Equatorial Guinea',
4388 'ER': 'Eritrea',
4389 'EE': 'Estonia',
4390 'ET': 'Ethiopia',
4391 'FK': 'Falkland Islands (Malvinas)',
4392 'FO': 'Faroe Islands',
4393 'FJ': 'Fiji',
4394 'FI': 'Finland',
4395 'FR': 'France',
4396 'GF': 'French Guiana',
4397 'PF': 'French Polynesia',
4398 'TF': 'French Southern Territories',
4399 'GA': 'Gabon',
4400 'GM': 'Gambia',
4401 'GE': 'Georgia',
4402 'DE': 'Germany',
4403 'GH': 'Ghana',
4404 'GI': 'Gibraltar',
4405 'GR': 'Greece',
4406 'GL': 'Greenland',
4407 'GD': 'Grenada',
4408 'GP': 'Guadeloupe',
4409 'GU': 'Guam',
4410 'GT': 'Guatemala',
4411 'GG': 'Guernsey',
4412 'GN': 'Guinea',
4413 'GW': 'Guinea-Bissau',
4414 'GY': 'Guyana',
4415 'HT': 'Haiti',
4416 'HM': 'Heard Island and McDonald Islands',
4417 'VA': 'Holy See (Vatican City State)',
4418 'HN': 'Honduras',
4419 'HK': 'Hong Kong',
4420 'HU': 'Hungary',
4421 'IS': 'Iceland',
4422 'IN': 'India',
4423 'ID': 'Indonesia',
4424 'IR': 'Iran, Islamic Republic of',
4425 'IQ': 'Iraq',
4426 'IE': 'Ireland',
4427 'IM': 'Isle of Man',
4428 'IL': 'Israel',
4429 'IT': 'Italy',
4430 'JM': 'Jamaica',
4431 'JP': 'Japan',
4432 'JE': 'Jersey',
4433 'JO': 'Jordan',
4434 'KZ': 'Kazakhstan',
4435 'KE': 'Kenya',
4436 'KI': 'Kiribati',
4437 'KP': 'Korea, Democratic People\'s Republic of',
4438 'KR': 'Korea, Republic of',
4439 'KW': 'Kuwait',
4440 'KG': 'Kyrgyzstan',
4441 'LA': 'Lao People\'s Democratic Republic',
4442 'LV': 'Latvia',
4443 'LB': 'Lebanon',
4444 'LS': 'Lesotho',
4445 'LR': 'Liberia',
4446 'LY': 'Libya',
4447 'LI': 'Liechtenstein',
4448 'LT': 'Lithuania',
4449 'LU': 'Luxembourg',
4450 'MO': 'Macao',
4451 'MK': 'Macedonia, the Former Yugoslav Republic of',
4452 'MG': 'Madagascar',
4453 'MW': 'Malawi',
4454 'MY': 'Malaysia',
4455 'MV': 'Maldives',
4456 'ML': 'Mali',
4457 'MT': 'Malta',
4458 'MH': 'Marshall Islands',
4459 'MQ': 'Martinique',
4460 'MR': 'Mauritania',
4461 'MU': 'Mauritius',
4462 'YT': 'Mayotte',
4463 'MX': 'Mexico',
4464 'FM': 'Micronesia, Federated States of',
4465 'MD': 'Moldova, Republic of',
4466 'MC': 'Monaco',
4467 'MN': 'Mongolia',
4468 'ME': 'Montenegro',
4469 'MS': 'Montserrat',
4470 'MA': 'Morocco',
4471 'MZ': 'Mozambique',
4472 'MM': 'Myanmar',
4473 'NA': 'Namibia',
4474 'NR': 'Nauru',
4475 'NP': 'Nepal',
4476 'NL': 'Netherlands',
4477 'NC': 'New Caledonia',
4478 'NZ': 'New Zealand',
4479 'NI': 'Nicaragua',
4480 'NE': 'Niger',
4481 'NG': 'Nigeria',
4482 'NU': 'Niue',
4483 'NF': 'Norfolk Island',
4484 'MP': 'Northern Mariana Islands',
4485 'NO': 'Norway',
4486 'OM': 'Oman',
4487 'PK': 'Pakistan',
4488 'PW': 'Palau',
4489 'PS': 'Palestine, State of',
4490 'PA': 'Panama',
4491 'PG': 'Papua New Guinea',
4492 'PY': 'Paraguay',
4493 'PE': 'Peru',
4494 'PH': 'Philippines',
4495 'PN': 'Pitcairn',
4496 'PL': 'Poland',
4497 'PT': 'Portugal',
4498 'PR': 'Puerto Rico',
4499 'QA': 'Qatar',
4500 'RE': 'Réunion',
4501 'RO': 'Romania',
4502 'RU': 'Russian Federation',
4503 'RW': 'Rwanda',
4504 'BL': 'Saint Barthélemy',
4505 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4506 'KN': 'Saint Kitts and Nevis',
4507 'LC': 'Saint Lucia',
4508 'MF': 'Saint Martin (French part)',
4509 'PM': 'Saint Pierre and Miquelon',
4510 'VC': 'Saint Vincent and the Grenadines',
4511 'WS': 'Samoa',
4512 'SM': 'San Marino',
4513 'ST': 'Sao Tome and Principe',
4514 'SA': 'Saudi Arabia',
4515 'SN': 'Senegal',
4516 'RS': 'Serbia',
4517 'SC': 'Seychelles',
4518 'SL': 'Sierra Leone',
4519 'SG': 'Singapore',
4520 'SX': 'Sint Maarten (Dutch part)',
4521 'SK': 'Slovakia',
4522 'SI': 'Slovenia',
4523 'SB': 'Solomon Islands',
4524 'SO': 'Somalia',
4525 'ZA': 'South Africa',
4526 'GS': 'South Georgia and the South Sandwich Islands',
4527 'SS': 'South Sudan',
4528 'ES': 'Spain',
4529 'LK': 'Sri Lanka',
4530 'SD': 'Sudan',
4531 'SR': 'Suriname',
4532 'SJ': 'Svalbard and Jan Mayen',
4533 'SZ': 'Swaziland',
4534 'SE': 'Sweden',
4535 'CH': 'Switzerland',
4536 'SY': 'Syrian Arab Republic',
4537 'TW': 'Taiwan, Province of China',
4538 'TJ': 'Tajikistan',
4539 'TZ': 'Tanzania, United Republic of',
4540 'TH': 'Thailand',
4541 'TL': 'Timor-Leste',
4542 'TG': 'Togo',
4543 'TK': 'Tokelau',
4544 'TO': 'Tonga',
4545 'TT': 'Trinidad and Tobago',
4546 'TN': 'Tunisia',
4547 'TR': 'Turkey',
4548 'TM': 'Turkmenistan',
4549 'TC': 'Turks and Caicos Islands',
4550 'TV': 'Tuvalu',
4551 'UG': 'Uganda',
4552 'UA': 'Ukraine',
4553 'AE': 'United Arab Emirates',
4554 'GB': 'United Kingdom',
4555 'US': 'United States',
4556 'UM': 'United States Minor Outlying Islands',
4557 'UY': 'Uruguay',
4558 'UZ': 'Uzbekistan',
4559 'VU': 'Vanuatu',
4560 'VE': 'Venezuela, Bolivarian Republic of',
4561 'VN': 'Viet Nam',
4562 'VG': 'Virgin Islands, British',
4563 'VI': 'Virgin Islands, U.S.',
4564 'WF': 'Wallis and Futuna',
4565 'EH': 'Western Sahara',
4566 'YE': 'Yemen',
4567 'ZM': 'Zambia',
4568 'ZW': 'Zimbabwe',
4569 # Not ISO 3166 codes, but used for IP blocks
4570 'AP': 'Asia/Pacific Region',
4571 'EU': 'Europe',
4572 }
4573
4574 @classmethod
4575 def short2full(cls, code):
4576 """Convert an ISO 3166-2 country code to the corresponding full name"""
4577 return cls._country_map.get(code.upper())
4578
4579
4580 class GeoUtils:
4581 # Major IPv4 address blocks per country
4582 _country_ip_map = {
4583 'AD': '46.172.224.0/19',
4584 'AE': '94.200.0.0/13',
4585 'AF': '149.54.0.0/17',
4586 'AG': '209.59.64.0/18',
4587 'AI': '204.14.248.0/21',
4588 'AL': '46.99.0.0/16',
4589 'AM': '46.70.0.0/15',
4590 'AO': '105.168.0.0/13',
4591 'AP': '182.50.184.0/21',
4592 'AQ': '23.154.160.0/24',
4593 'AR': '181.0.0.0/12',
4594 'AS': '202.70.112.0/20',
4595 'AT': '77.116.0.0/14',
4596 'AU': '1.128.0.0/11',
4597 'AW': '181.41.0.0/18',
4598 'AX': '185.217.4.0/22',
4599 'AZ': '5.197.0.0/16',
4600 'BA': '31.176.128.0/17',
4601 'BB': '65.48.128.0/17',
4602 'BD': '114.130.0.0/16',
4603 'BE': '57.0.0.0/8',
4604 'BF': '102.178.0.0/15',
4605 'BG': '95.42.0.0/15',
4606 'BH': '37.131.0.0/17',
4607 'BI': '154.117.192.0/18',
4608 'BJ': '137.255.0.0/16',
4609 'BL': '185.212.72.0/23',
4610 'BM': '196.12.64.0/18',
4611 'BN': '156.31.0.0/16',
4612 'BO': '161.56.0.0/16',
4613 'BQ': '161.0.80.0/20',
4614 'BR': '191.128.0.0/12',
4615 'BS': '24.51.64.0/18',
4616 'BT': '119.2.96.0/19',
4617 'BW': '168.167.0.0/16',
4618 'BY': '178.120.0.0/13',
4619 'BZ': '179.42.192.0/18',
4620 'CA': '99.224.0.0/11',
4621 'CD': '41.243.0.0/16',
4622 'CF': '197.242.176.0/21',
4623 'CG': '160.113.0.0/16',
4624 'CH': '85.0.0.0/13',
4625 'CI': '102.136.0.0/14',
4626 'CK': '202.65.32.0/19',
4627 'CL': '152.172.0.0/14',
4628 'CM': '102.244.0.0/14',
4629 'CN': '36.128.0.0/10',
4630 'CO': '181.240.0.0/12',
4631 'CR': '201.192.0.0/12',
4632 'CU': '152.206.0.0/15',
4633 'CV': '165.90.96.0/19',
4634 'CW': '190.88.128.0/17',
4635 'CY': '31.153.0.0/16',
4636 'CZ': '88.100.0.0/14',
4637 'DE': '53.0.0.0/8',
4638 'DJ': '197.241.0.0/17',
4639 'DK': '87.48.0.0/12',
4640 'DM': '192.243.48.0/20',
4641 'DO': '152.166.0.0/15',
4642 'DZ': '41.96.0.0/12',
4643 'EC': '186.68.0.0/15',
4644 'EE': '90.190.0.0/15',
4645 'EG': '156.160.0.0/11',
4646 'ER': '196.200.96.0/20',
4647 'ES': '88.0.0.0/11',
4648 'ET': '196.188.0.0/14',
4649 'EU': '2.16.0.0/13',
4650 'FI': '91.152.0.0/13',
4651 'FJ': '144.120.0.0/16',
4652 'FK': '80.73.208.0/21',
4653 'FM': '119.252.112.0/20',
4654 'FO': '88.85.32.0/19',
4655 'FR': '90.0.0.0/9',
4656 'GA': '41.158.0.0/15',
4657 'GB': '25.0.0.0/8',
4658 'GD': '74.122.88.0/21',
4659 'GE': '31.146.0.0/16',
4660 'GF': '161.22.64.0/18',
4661 'GG': '62.68.160.0/19',
4662 'GH': '154.160.0.0/12',
4663 'GI': '95.164.0.0/16',
4664 'GL': '88.83.0.0/19',
4665 'GM': '160.182.0.0/15',
4666 'GN': '197.149.192.0/18',
4667 'GP': '104.250.0.0/19',
4668 'GQ': '105.235.224.0/20',
4669 'GR': '94.64.0.0/13',
4670 'GT': '168.234.0.0/16',
4671 'GU': '168.123.0.0/16',
4672 'GW': '197.214.80.0/20',
4673 'GY': '181.41.64.0/18',
4674 'HK': '113.252.0.0/14',
4675 'HN': '181.210.0.0/16',
4676 'HR': '93.136.0.0/13',
4677 'HT': '148.102.128.0/17',
4678 'HU': '84.0.0.0/14',
4679 'ID': '39.192.0.0/10',
4680 'IE': '87.32.0.0/12',
4681 'IL': '79.176.0.0/13',
4682 'IM': '5.62.80.0/20',
4683 'IN': '117.192.0.0/10',
4684 'IO': '203.83.48.0/21',
4685 'IQ': '37.236.0.0/14',
4686 'IR': '2.176.0.0/12',
4687 'IS': '82.221.0.0/16',
4688 'IT': '79.0.0.0/10',
4689 'JE': '87.244.64.0/18',
4690 'JM': '72.27.0.0/17',
4691 'JO': '176.29.0.0/16',
4692 'JP': '133.0.0.0/8',
4693 'KE': '105.48.0.0/12',
4694 'KG': '158.181.128.0/17',
4695 'KH': '36.37.128.0/17',
4696 'KI': '103.25.140.0/22',
4697 'KM': '197.255.224.0/20',
4698 'KN': '198.167.192.0/19',
4699 'KP': '175.45.176.0/22',
4700 'KR': '175.192.0.0/10',
4701 'KW': '37.36.0.0/14',
4702 'KY': '64.96.0.0/15',
4703 'KZ': '2.72.0.0/13',
4704 'LA': '115.84.64.0/18',
4705 'LB': '178.135.0.0/16',
4706 'LC': '24.92.144.0/20',
4707 'LI': '82.117.0.0/19',
4708 'LK': '112.134.0.0/15',
4709 'LR': '102.183.0.0/16',
4710 'LS': '129.232.0.0/17',
4711 'LT': '78.56.0.0/13',
4712 'LU': '188.42.0.0/16',
4713 'LV': '46.109.0.0/16',
4714 'LY': '41.252.0.0/14',
4715 'MA': '105.128.0.0/11',
4716 'MC': '88.209.64.0/18',
4717 'MD': '37.246.0.0/16',
4718 'ME': '178.175.0.0/17',
4719 'MF': '74.112.232.0/21',
4720 'MG': '154.126.0.0/17',
4721 'MH': '117.103.88.0/21',
4722 'MK': '77.28.0.0/15',
4723 'ML': '154.118.128.0/18',
4724 'MM': '37.111.0.0/17',
4725 'MN': '49.0.128.0/17',
4726 'MO': '60.246.0.0/16',
4727 'MP': '202.88.64.0/20',
4728 'MQ': '109.203.224.0/19',
4729 'MR': '41.188.64.0/18',
4730 'MS': '208.90.112.0/22',
4731 'MT': '46.11.0.0/16',
4732 'MU': '105.16.0.0/12',
4733 'MV': '27.114.128.0/18',
4734 'MW': '102.70.0.0/15',
4735 'MX': '187.192.0.0/11',
4736 'MY': '175.136.0.0/13',
4737 'MZ': '197.218.0.0/15',
4738 'NA': '41.182.0.0/16',
4739 'NC': '101.101.0.0/18',
4740 'NE': '197.214.0.0/18',
4741 'NF': '203.17.240.0/22',
4742 'NG': '105.112.0.0/12',
4743 'NI': '186.76.0.0/15',
4744 'NL': '145.96.0.0/11',
4745 'NO': '84.208.0.0/13',
4746 'NP': '36.252.0.0/15',
4747 'NR': '203.98.224.0/19',
4748 'NU': '49.156.48.0/22',
4749 'NZ': '49.224.0.0/14',
4750 'OM': '5.36.0.0/15',
4751 'PA': '186.72.0.0/15',
4752 'PE': '186.160.0.0/14',
4753 'PF': '123.50.64.0/18',
4754 'PG': '124.240.192.0/19',
4755 'PH': '49.144.0.0/13',
4756 'PK': '39.32.0.0/11',
4757 'PL': '83.0.0.0/11',
4758 'PM': '70.36.0.0/20',
4759 'PR': '66.50.0.0/16',
4760 'PS': '188.161.0.0/16',
4761 'PT': '85.240.0.0/13',
4762 'PW': '202.124.224.0/20',
4763 'PY': '181.120.0.0/14',
4764 'QA': '37.210.0.0/15',
4765 'RE': '102.35.0.0/16',
4766 'RO': '79.112.0.0/13',
4767 'RS': '93.86.0.0/15',
4768 'RU': '5.136.0.0/13',
4769 'RW': '41.186.0.0/16',
4770 'SA': '188.48.0.0/13',
4771 'SB': '202.1.160.0/19',
4772 'SC': '154.192.0.0/11',
4773 'SD': '102.120.0.0/13',
4774 'SE': '78.64.0.0/12',
4775 'SG': '8.128.0.0/10',
4776 'SI': '188.196.0.0/14',
4777 'SK': '78.98.0.0/15',
4778 'SL': '102.143.0.0/17',
4779 'SM': '89.186.32.0/19',
4780 'SN': '41.82.0.0/15',
4781 'SO': '154.115.192.0/18',
4782 'SR': '186.179.128.0/17',
4783 'SS': '105.235.208.0/21',
4784 'ST': '197.159.160.0/19',
4785 'SV': '168.243.0.0/16',
4786 'SX': '190.102.0.0/20',
4787 'SY': '5.0.0.0/16',
4788 'SZ': '41.84.224.0/19',
4789 'TC': '65.255.48.0/20',
4790 'TD': '154.68.128.0/19',
4791 'TG': '196.168.0.0/14',
4792 'TH': '171.96.0.0/13',
4793 'TJ': '85.9.128.0/18',
4794 'TK': '27.96.24.0/21',
4795 'TL': '180.189.160.0/20',
4796 'TM': '95.85.96.0/19',
4797 'TN': '197.0.0.0/11',
4798 'TO': '175.176.144.0/21',
4799 'TR': '78.160.0.0/11',
4800 'TT': '186.44.0.0/15',
4801 'TV': '202.2.96.0/19',
4802 'TW': '120.96.0.0/11',
4803 'TZ': '156.156.0.0/14',
4804 'UA': '37.52.0.0/14',
4805 'UG': '102.80.0.0/13',
4806 'US': '6.0.0.0/8',
4807 'UY': '167.56.0.0/13',
4808 'UZ': '84.54.64.0/18',
4809 'VA': '212.77.0.0/19',
4810 'VC': '207.191.240.0/21',
4811 'VE': '186.88.0.0/13',
4812 'VG': '66.81.192.0/20',
4813 'VI': '146.226.0.0/16',
4814 'VN': '14.160.0.0/11',
4815 'VU': '202.80.32.0/20',
4816 'WF': '117.20.32.0/21',
4817 'WS': '202.4.32.0/19',
4818 'YE': '134.35.0.0/16',
4819 'YT': '41.242.116.0/22',
4820 'ZA': '41.0.0.0/11',
4821 'ZM': '102.144.0.0/13',
4822 'ZW': '102.177.192.0/18',
4823 }
4824
4825 @classmethod
4826 def random_ipv4(cls, code_or_block):
4827 if len(code_or_block) == 2:
4828 block = cls._country_ip_map.get(code_or_block.upper())
4829 if not block:
4830 return None
4831 else:
4832 block = code_or_block
4833 addr, preflen = block.split('/')
4834 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4835 addr_max = addr_min | (0xffffffff >> int(preflen))
4836 return str(socket.inet_ntoa(
4837 struct.pack('!L', random.randint(addr_min, addr_max))))
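
# The prefix length determines how many leading bits stay fixed; addr_max
# simply sets all remaining host bits. E.g. (illustrative) random_ipv4('US')
# draws uniformly from 6.0.0.0/8 per the table above, and
# random_ipv4('1.2.3.0/24') from 1.2.3.0 through 1.2.3.255.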
4838
4839
4840 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4841 def __init__(self, proxies=None):
4842 # Set default handlers
4843 for type in ('http', 'https'):
4844 setattr(self, '%s_open' % type,
4845 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4846 meth(r, proxy, type))
4847 urllib.request.ProxyHandler.__init__(self, proxies)
4848
4849 def proxy_open(self, req, proxy, type):
4850 req_proxy = req.headers.get('Ytdl-request-proxy')
4851 if req_proxy is not None:
4852 proxy = req_proxy
4853 del req.headers['Ytdl-request-proxy']
4854
4855 if proxy == '__noproxy__':
4856 return None # No Proxy
4857 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4858 req.add_header('Ytdl-socks-proxy', proxy)
4859 # yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS
4860 return None
4861 return urllib.request.ProxyHandler.proxy_open(
4862 self, req, proxy, type)
4863
4864
4865 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4866 # released into Public Domain
4867 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4868
4869 def long_to_bytes(n, blocksize=0):
4870 """long_to_bytes(n:long, blocksize:int) : string
4871 Convert a long integer to a byte string.
4872
4873 If optional blocksize is given and greater than zero, pad the front of the
4874 byte string with binary zeros so that the length is a multiple of
4875 blocksize.
4876 """
4877 # after much testing, this algorithm was deemed to be the fastest
4878 s = b''
4879 n = int(n)
4880 while n > 0:
4881 s = struct.pack('>I', n & 0xffffffff) + s
4882 n = n >> 32
4883 # strip off leading zeros
4884 for i in range(len(s)):
4885 if s[i] != b'\000'[0]:
4886 break
4887 else:
4888 # only happens when n == 0
4889 s = b'\000'
4890 i = 0
4891 s = s[i:]
4892 # add back some pad bytes. this could be done more efficiently w.r.t. the
4893 # de-padding being done above, but sigh...
4894 if blocksize > 0 and len(s) % blocksize:
4895 s = (blocksize - len(s) % blocksize) * b'\000' + s
4896 return s
4897
4898
4899 def bytes_to_long(s):
4900 """bytes_to_long(string) : long
4901 Convert a byte string to a long integer.
4902
4903 This is (essentially) the inverse of long_to_bytes().
4904 """
4905 acc = 0
4906 length = len(s)
4907 if length % 4:
4908 extra = (4 - length % 4)
4909 s = b'\000' * extra + s
4910 length = length + extra
4911 for i in range(0, length, 4):
4912 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4913 return acc
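
# For instance (big-endian round trip):
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> bytes_to_long(b'\x01\x00')
#   256
#   >>> long_to_bytes(1, blocksize=4)
#   b'\x00\x00\x00\x01'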
4914
4915
4916 def ohdave_rsa_encrypt(data, exponent, modulus):
4917 '''
4918 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4919
4920 Input:
4921 data: data to encrypt, bytes-like object
4922 exponent, modulus: parameter e and N of RSA algorithm, both integer
4923 Output: hex string of encrypted data
4924
4925 Limitation: supports one block encryption only
4926 '''
4927
4928 payload = int(binascii.hexlify(data[::-1]), 16)
4929 encrypted = pow(payload, exponent, modulus)
4930 return '%x' % encrypted
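
# Toy example (illustrative only -- e=3, N=33 is far too small for real RSA).
# Note that the input bytes are reversed, i.e. treated as little-endian:
#   >>> ohdave_rsa_encrypt(b'\x02', 3, 33)   # pow(2, 3, 33) == 8
#   '8'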
4931
4932
4933 def pkcs1pad(data, length):
4934 """
4935 Padding input data with PKCS#1 scheme
4936
4937 @param {int[]} data input data
4938 @param {int} length target length
4939 @returns {int[]} padded data
4940 """
4941 if len(data) > length - 11:
4942 raise ValueError('Input data too long for PKCS#1 padding')
4943
4944 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding bytes must be nonzero per PKCS#1 v1.5
4945 return [0, 2] + pseudo_random + [0] + data
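
# For instance (illustrative; the middle padding bytes are random):
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> len(padded), padded[:2], padded[-4:]
#   (16, [0, 2], [0, 1, 2, 3])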
4946
4947
4948 def _base_n_table(n, table):
4949 if not table and not n:
4950 raise ValueError('Either table or n must be specified')
4951 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4952
4953 if n and n != len(table):
4954 raise ValueError(f'base {n} exceeds table length {len(table)}')
4955 return table
4956
4957
4958 def encode_base_n(num, n=None, table=None):
4959 """Convert given int to a base-n string"""
4960 table = _base_n_table(n, table)
4961 if not num:
4962 return table[0]
4963
4964 result, base = '', len(table)
4965 while num:
4966 result = table[num % base] + result
4967 num = num // base
4968 return result
4969
4970
4971 def decode_base_n(string, n=None, table=None):
4972 """Convert given base-n string to int"""
4973 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4974 result, base = 0, len(table)
4975 for char in string:
4976 result = result * base + table[char]
4977 return result
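
# For instance:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(5, table='01')   # a custom table implies the base
#   '101'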
4978
4979
4980 def decode_base(value, digits):
4981 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4982 f'in a future version. Use {__name__}.decode_base_n instead')
4983 return decode_base_n(value, table=digits)
4984
4985
4986 def decode_packed_codes(code):
4987 mobj = re.search(PACKED_CODES_RE, code)
4988 obfuscated_code, base, count, symbols = mobj.groups()
4989 base = int(base)
4990 count = int(count)
4991 symbols = symbols.split('|')
4992 symbol_table = {}
4993
4994 while count:
4995 count -= 1
4996 base_n_count = encode_base_n(count, base)
4997 symbol_table[base_n_count] = symbols[count] or base_n_count
4998
4999 return re.sub(
5000 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5001 obfuscated_code)
5002
5003
5004 def caesar(s, alphabet, shift):
5005 if shift == 0:
5006 return s
5007 l = len(alphabet)
5008 return ''.join(
5009 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5010 for c in s)
5011
5012
5013 def rot47(s):
5014 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
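
# For instance:
#   >>> caesar('hello', 'abcdefghijklmnopqrstuvwxyz', 3)
#   'khoor'
#   >>> rot47(rot47('secret'))   # applying rot47 twice shifts by 94, a full cycle
#   'secret'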
5015
5016
5017 def parse_m3u8_attributes(attrib):
5018 info = {}
5019 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5020 if val.startswith('"'):
5021 val = val[1:-1]
5022 info[key] = val
5023 return info
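
# For instance (illustrative attribute list; quoted values may contain commas):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}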
5024
5025
5026 def urshift(val, n):
5027 return val >> n if val >= 0 else (val + 0x100000000) >> n
5028
5029
5030 # Based on png2str() written by @gdkchan and improved by @yokrysty
5031 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5032 def decode_png(png_data):
5033 # Reference: https://www.w3.org/TR/PNG/
5034 header = png_data[8:]
5035
5036 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5037 raise OSError('Not a valid PNG file.')
5038
5039 int_map = {1: '>B', 2: '>H', 4: '>I'}
5040 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5041
5042 chunks = []
5043
5044 while header:
5045 length = unpack_integer(header[:4])
5046 header = header[4:]
5047
5048 chunk_type = header[:4]
5049 header = header[4:]
5050
5051 chunk_data = header[:length]
5052 header = header[length:]
5053
5054 header = header[4:] # Skip CRC
5055
5056 chunks.append({
5057 'type': chunk_type,
5058 'length': length,
5059 'data': chunk_data
5060 })
5061
5062 ihdr = chunks[0]['data']
5063
5064 width = unpack_integer(ihdr[:4])
5065 height = unpack_integer(ihdr[4:8])
5066
5067 idat = b''
5068
5069 for chunk in chunks:
5070 if chunk['type'] == b'IDAT':
5071 idat += chunk['data']
5072
5073 if not idat:
5074 raise OSError('Unable to read PNG data.')
5075
5076 decompressed_data = bytearray(zlib.decompress(idat))
5077
5078 stride = width * 3
5079 pixels = []
5080
5081 def _get_pixel(idx):
5082 x = idx % stride
5083 y = idx // stride
5084 return pixels[y][x]
5085
5086 for y in range(height):
5087 basePos = y * (1 + stride)
5088 filter_type = decompressed_data[basePos]
5089
5090 current_row = []
5091
5092 pixels.append(current_row)
5093
5094 for x in range(stride):
5095 color = decompressed_data[1 + basePos + x]
5096 basex = y * stride + x
5097 left = 0
5098 up = 0
5099
5100 if x > 2:
5101 left = _get_pixel(basex - 3)
5102 if y > 0:
5103 up = _get_pixel(basex - stride)
5104
5105 if filter_type == 1: # Sub
5106 color = (color + left) & 0xff
5107 elif filter_type == 2: # Up
5108 color = (color + up) & 0xff
5109 elif filter_type == 3: # Average
5110 color = (color + ((left + up) >> 1)) & 0xff
5111 elif filter_type == 4: # Paeth
5112 a = left
5113 b = up
5114 c = 0
5115
5116 if x > 2 and y > 0:
5117 c = _get_pixel(basex - stride - 3)
5118
5119 p = a + b - c
5120
5121 pa = abs(p - a)
5122 pb = abs(p - b)
5123 pc = abs(p - c)
5124
5125 if pa <= pb and pa <= pc:
5126 color = (color + a) & 0xff
5127 elif pb <= pc:
5128 color = (color + b) & 0xff
5129 else:
5130 color = (color + c) & 0xff
5131
5132 current_row.append(color)
5133
5134 return width, height, pixels
5135
5136
5137 def write_xattr(path, key, value):
5138 # Windows: Write xattrs to NTFS Alternate Data Streams:
5139 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5140 if compat_os_name == 'nt':
5141 assert ':' not in key
5142 assert os.path.exists(path)
5143
5144 try:
5145 with open(f'{path}:{key}', 'wb') as f:
5146 f.write(value)
5147 except OSError as e:
5148 raise XAttrMetadataError(e.errno, e.strerror)
5149 return
5150
5151 # UNIX Method 1. Use xattrs/pyxattrs modules
5152
5153 setxattr = None
5154 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5155 # Unicode arguments are not supported in pyxattr until version 0.5.0
5156 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5157 if version_tuple(xattr.__version__) >= (0, 5, 0):
5158 setxattr = xattr.set
5159 elif xattr:
5160 setxattr = xattr.setxattr
5161
5162 if setxattr:
5163 try:
5164 setxattr(path, key, value)
5165 except OSError as e:
5166 raise XAttrMetadataError(e.errno, e.strerror)
5167 return
5168
5169 # UNIX Method 2. Use setfattr/xattr executables
5170 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5171 else 'xattr' if check_executable('xattr', ['-h']) else None)
5172 if not exe:
5173 raise XAttrUnavailableError(
5174 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5175 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5176
5177 value = value.decode()
5178 try:
5179 _, stderr, returncode = Popen.run(
5180 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5181 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5182 except OSError as e:
5183 raise XAttrMetadataError(e.errno, e.strerror)
5184 if returncode:
5185 raise XAttrMetadataError(returncode, stderr)
5186
5187
5188 def random_birthday(year_field, month_field, day_field):
5189 start_date = datetime.date(1950, 1, 1)
5190 end_date = datetime.date(1995, 12, 31)
5191 offset = random.randint(0, (end_date - start_date).days)
5192 random_date = start_date + datetime.timedelta(offset)
5193 return {
5194 year_field: str(random_date.year),
5195 month_field: str(random_date.month),
5196 day_field: str(random_date.day),
5197 }
5198
5199
5200 # Templates for internet shortcut files, which are plain text files.
5201 DOT_URL_LINK_TEMPLATE = '''\
5202 [InternetShortcut]
5203 URL=%(url)s
5204 '''
5205
5206 DOT_WEBLOC_LINK_TEMPLATE = '''\
5207 <?xml version="1.0" encoding="UTF-8"?>
5208 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5209 <plist version="1.0">
5210 <dict>
5211 \t<key>URL</key>
5212 \t<string>%(url)s</string>
5213 </dict>
5214 </plist>
5215 '''
5216
5217 DOT_DESKTOP_LINK_TEMPLATE = '''\
5218 [Desktop Entry]
5219 Encoding=UTF-8
5220 Name=%(filename)s
5221 Type=Link
5222 URL=%(url)s
5223 Icon=text-html
5224 '''
5225
5226 LINK_TEMPLATES = {
5227 'url': DOT_URL_LINK_TEMPLATE,
5228 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5229 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5230 }
5231
5232
5233 def iri_to_uri(iri):
5234 """
5235 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5236
5237 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5238 """
5239
5240 iri_parts = urllib.parse.urlparse(iri)
5241
5242 if '[' in iri_parts.netloc:
5243 raise ValueError('IPv6 URIs are not yet supported.')
5244 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5245
5246 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5247
5248 net_location = ''
5249 if iri_parts.username:
5250 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5251 if iri_parts.password is not None:
5252 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5253 net_location += '@'
5254
5255 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5256 # The 'idna' encoding produces ASCII text.
5257 if iri_parts.port is not None and iri_parts.port != 80:
5258 net_location += ':' + str(iri_parts.port)
5259
5260 return urllib.parse.urlunparse(
5261 (iri_parts.scheme,
5262 net_location,
5263
5264 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5265
5266 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5267 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5268
5269 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5270 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5271
5272 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5273
5274 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
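
# For instance (illustrative):
#   >>> iri_to_uri('http://example.com/päth?q=wört')
#   'http://example.com/p%C3%A4th?q=w%C3%B6rt'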
5275
5276
5277 def to_high_limit_path(path):
5278 if sys.platform in ['win32', 'cygwin']:
5279 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5280 return '\\\\?\\' + os.path.abspath(path)
5281
5282 return path
5283
5284
5285 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5286 val = traverse_obj(obj, *variadic(field))
5287 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5288 return default
5289 return template % func(val)
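
# For instance (illustrative):
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({}, 'height', '%sp', default='unknown')
#   'unknown'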
5290
5291
5292 def clean_podcast_url(url):
5293 return re.sub(r'''(?x)
5294 (?:
5295 (?:
5296 chtbl\.com/track|
5297 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5298 play\.podtrac\.com
5299 )/[^/]+|
5300 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5301 flex\.acast\.com|
5302 pd(?:
5303 cn\.co| # https://podcorn.com/analytics-prefix/
5304 st\.fm # https://podsights.com/docs/
5305 )/e
5306 )/''', '', url)
5307
5308
5309 _HEX_TABLE = '0123456789abcdef'
5310
5311
5312 def random_uuidv4():
5313 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5314
5315
5316 def make_dir(path, to_screen=None):
5317 try:
5318 dn = os.path.dirname(path)
5319 if dn and not os.path.exists(dn):
5320 os.makedirs(dn)
5321 return True
5322 except OSError as err:
5323 if callable(to_screen):  # callable() returns a bool, so the old "is not None" comparison was always true
5324 to_screen('unable to create directory ' + error_to_compat_str(err))
5325 return False
5326
5327
5328 def get_executable_path():
5329 from .update import _get_variant_and_executable_path
5330
5331 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5332
5333
5334 def load_plugins(name, suffix, namespace):
5335 classes = {}
5336 with contextlib.suppress(FileNotFoundError):
5337 plugins_spec = importlib.util.spec_from_file_location(
5338 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5339 plugins = importlib.util.module_from_spec(plugins_spec)
5340 sys.modules[plugins_spec.name] = plugins
5341 plugins_spec.loader.exec_module(plugins)
5342 for name in dir(plugins):
5343 if name in namespace:
5344 continue
5345 if not name.endswith(suffix):
5346 continue
5347 klass = getattr(plugins, name)
5348 classes[name] = namespace[name] = klass
5349 return classes
5350
5351
5352 def traverse_obj(
5353 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5354 casesense=True, is_user_input=False, traverse_string=False):
5355 """
5356 Safely traverse nested `dict`s and `Sequence`s
5357
5358 >>> obj = [{}, {"key": "value"}]
5359 >>> traverse_obj(obj, (1, "key"))
5360 "value"
5361
5362 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5363 The next path will also be tested if the path branched but no results could be found.
5364 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5365 A value of None is treated as the absence of a value.
5366
5367 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5368
5369 The keys in the path can be one of:
5370 - `None`: Return the current object.
5371 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5372 - `slice`: Branch out and return all values in `obj[key]`.
5373 - `Ellipsis`: Branch out and return a list of all values.
5374 - `tuple`/`list`: Branch out and return a list of all matching values.
5375 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5376 - `function`: Branch out and return values filtered by the function.
5377 Read as: `[value for key, value in obj if function(key, value)]`.
5378 For `Sequence`s, `key` is the index of the value.
5379 - `dict` Transform the current object and return a matching dict.
5380 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5381
5382 `tuple`, `list`, and `dict` all support nested paths and branches.
5383
5384 @params paths Paths which to traverse by.
5385 @param default Value to return if the paths do not match.
5386 @param expected_type If a `type`, only accept final values of this type.
5387 If any other callable, try to call the function on each result.
5388 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5389 @param casesense If `False`, consider string dictionary keys as case insensitive.
5390
5391 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5392
5393 @param is_user_input Whether the keys are generated from user input.
5394 If `True` strings get converted to `int`/`slice` if needed.
5395 @param traverse_string Whether to traverse into objects as strings.
5396 If `True`, any non-compatible object will first be
5397 converted into a string and then traversed into.
5398
5399
5400 @returns The result of the object traversal.
5401 If successful, `get_all=True`, and the path branches at least once,
5402 then a list of results is returned instead.
5403 A list is always returned if the last path branches and no `default` is given.
5404 """
5405 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5406 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5407
5408 if isinstance(expected_type, type):
5409 type_test = lambda val: val if isinstance(val, expected_type) else None
5410 else:
5411 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5412
5413 def apply_key(key, obj):
5414 if obj is None:
5415 return
5416
5417 elif key is None:
5418 yield obj
5419
5420 elif isinstance(key, (list, tuple)):
5421 for branch in key:
5422 _, result = apply_path(obj, branch)
5423 yield from result
5424
5425 elif key is ...:
5426 if isinstance(obj, collections.abc.Mapping):
5427 yield from obj.values()
5428 elif is_sequence(obj):
5429 yield from obj
5430 elif isinstance(obj, re.Match):
5431 yield from obj.groups()
5432 elif traverse_string:
5433 yield from str(obj)
5434
5435 elif callable(key):
5436 if is_sequence(obj):
5437 iter_obj = enumerate(obj)
5438 elif isinstance(obj, collections.abc.Mapping):
5439 iter_obj = obj.items()
5440 elif isinstance(obj, re.Match):
5441 iter_obj = enumerate((obj.group(), *obj.groups()))
5442 elif traverse_string:
5443 iter_obj = enumerate(str(obj))
5444 else:
5445 return
5446 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5447
5448 elif isinstance(key, dict):
5449 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5450 yield {k: v if v is not None else default for k, v in iter_obj
5451 if v is not None or default is not NO_DEFAULT}
5452
5453 elif isinstance(obj, collections.abc.Mapping):
5454 yield (obj.get(key) if casesense or (key in obj)
5455 else next((v for k, v in obj.items() if casefold(k) == key), None))
5456
5457 elif isinstance(obj, re.Match):
5458 if isinstance(key, int) or casesense:
5459 with contextlib.suppress(IndexError):
5460 yield obj.group(key)
5461 return
5462
5463 if not isinstance(key, str):
5464 return
5465
5466 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5467
5468 else:
5469 if is_user_input:
5470 key = (int_or_none(key) if ':' not in key
5471 else slice(*map(int_or_none, key.split(':'))))
5472
5473 if not isinstance(key, (int, slice)):
5474 return
5475
5476 if not is_sequence(obj):
5477 if not traverse_string:
5478 return
5479 obj = str(obj)
5480
5481 with contextlib.suppress(IndexError):
5482 yield obj[key]
5483
5484 def apply_path(start_obj, path):
5485 objs = (start_obj,)
5486 has_branched = False
5487
5488 for key in variadic(path):
5489 if is_user_input and key == ':':
5490 key = ...
5491
5492 if not casesense and isinstance(key, str):
5493 key = key.casefold()
5494
5495 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5496 has_branched = True
5497
5498 key_func = functools.partial(apply_key, key)
5499 objs = itertools.chain.from_iterable(map(key_func, objs))
5500
5501 return has_branched, objs
5502
5503 def _traverse_obj(obj, path, use_list=True):
5504 has_branched, results = apply_path(obj, path)
5505 results = LazyList(x for x in map(type_test, results) if x is not None)
5506
5507 if get_all and has_branched:
5508 return results.exhaust() if results or use_list else None
5509
5510 return results[0] if results else None
5511
5512 for index, path in enumerate(paths, 1):
5513 use_list = default is NO_DEFAULT and index == len(paths)
5514 result = _traverse_obj(obj, path, use_list)
5515 if result is not None:
5516 return result
5517
5518 return None if default is NO_DEFAULT else default
5519
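# Editor's sketch (illustrative only, never called): how the path features
# documented above compose. The `info` dict and its values are made up.
def _traverse_obj_examples():
    info = {'formats': [{'url': 'https://cdn.example/a', 'height': 720},
                        {'url': 'https://cdn.example/b'}]}
    # plain keys index step by step
    assert traverse_obj(info, ('formats', 0, 'url')) == 'https://cdn.example/a'
    # `...` branches over all items; branches missing the key are dropped
    assert traverse_obj(info, ('formats', ..., 'height')) == [720]
    # a function key keeps the (key, value) pairs for which it returns true
    assert traverse_obj(info, ('formats', 0, lambda _, v: v == 720)) == [720]
    # a dict key builds a new dict, one sub-path per entry
    assert traverse_obj(info, {'u': ('formats', 1, 'url')}) == {'u': 'https://cdn.example/b'}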
5520
5521 def traverse_dict(dictn, keys, casesense=True):
5522 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5523 f'in a future version. Use "{__name__}.traverse_obj" instead')
5524 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5525
5526
5527 def get_first(obj, keys, **kwargs):
5528 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5529
5530
5531 def time_seconds(**kwargs):
5532 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5533 return t.timestamp()
5534
5535
5536 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5537 # the resulting format is JWS Compact Serialization
5538 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5539 # Implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5540 def jwt_encode_hs256(payload_data, key, headers=None):
5541 header_data = {
5542 'alg': 'HS256',
5543 'typ': 'JWT',
5544 }
5545 if headers:
5546 header_data.update(headers)
5547 # RFC 7515 §2 mandates the unpadded base64url alphabet for all three segments
5548 b64url = lambda x: base64.urlsafe_b64encode(x).rstrip(b'=')
5549 header_b64 = b64url(json.dumps(header_data).encode())
5550 payload_b64 = b64url(json.dumps(payload_data).encode())
5551 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5552 return header_b64 + b'.' + payload_b64 + b'.' + b64url(h.digest())
5553
5554
5555 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5556 def jwt_decode_hs256(jwt):
5557 header_b64, payload_b64, signature_b64 = jwt.split('.')
5558 # Add back trailing '='s that may have been stripped; superfluous '='s are ignored
5559 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5560 return payload_data
5561
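# Editor's sketch (hypothetical key and claims): round-tripping a token
# through the two helpers above.
def _jwt_round_trip_example():
    token = jwt_encode_hs256({'sub': 'user', 'exp': 1700000000}, 'secret-key')
    assert token.count(b'.') == 2  # header.payload.signature
    assert jwt_decode_hs256(token.decode())['sub'] == 'user'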
5562
5563 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5564
5565
5566 @functools.cache
5567 def supports_terminal_sequences(stream):
5568 if compat_os_name == 'nt':
5569 if not WINDOWS_VT_MODE:
5570 return False
5571 elif not os.getenv('TERM'):
5572 return False
5573 try:
5574 return stream.isatty()
5575 except BaseException:
5576 return False
5577
5578
5579 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5580 if get_windows_version() < (10, 0, 10586):
5581 return
5582 global WINDOWS_VT_MODE
5583 try:
5584 Popen.run('', shell=True)
5585 except Exception:
5586 return
5587
5588 WINDOWS_VT_MODE = True
5589 supports_terminal_sequences.cache_clear()
5590
5591
5592 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5593
5594
5595 def remove_terminal_sequences(string):
5596 return _terminal_sequences_re.sub('', string)
5597
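# Editor's sketch: stripping SGR color codes from a styled log line.
def _remove_terminal_sequences_example():
    assert remove_terminal_sequences('\033[0;31mERROR:\033[0m oops') == 'ERROR: oops'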
5598
5599 def number_of_digits(number):
5600 return len('%d' % number)
5601
5602
5603 def join_nonempty(*values, delim='-', from_dict=None):
5604 if from_dict is not None:
5605 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5606 return delim.join(map(str, filter(None, values)))
5607
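# Editor's sketch: falsy parts are dropped; `from_dict` first resolves each
# value through `traverse_obj`.
def _join_nonempty_examples():
    assert join_nonempty('mp4', None, 1080, '', delim='-') == 'mp4-1080'
    assert join_nonempty('title', 'id', from_dict={'id': 'x1', 'title': 'clip'}) == 'clip-x1'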
5608
5609 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5610 """
5611 Find the largest format dimensions in terms of video width and, for each thumbnail:
5612 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5613 * Update dimensions
5614
5615 This function is useful with video services that scale the provided thumbnails on demand
5616 """
5617 _keys = ('width', 'height')
5618 max_dimensions = max(
5619 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5620 default=(0, 0))
5621 if not max_dimensions[0]:
5622 return thumbnails
5623 return [
5624 merge_dicts(
5625 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5626 dict(zip(_keys, max_dimensions)), thumbnail)
5627 for thumbnail in thumbnails
5628 ]
5629
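# Editor's sketch (made-up URLs): every thumbnail is upgraded to the widest
# format, assuming `url_width_re` matches the width embedded in the URL.
def _scale_thumbnails_example():
    formats = [{'width': 640, 'height': 360}, {'width': 1920, 'height': 1080}]
    thumbnails = [{'url': 'https://cdn.example/thumb-320.jpg'}]
    assert scale_thumbnails_to_max_format_width(formats, thumbnails, r'\d+') == [
        {'url': 'https://cdn.example/thumb-1920.jpg', 'width': 1920, 'height': 1080}]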
5630
5631 def parse_http_range(range):
5632 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5633 if not range:
5634 return None, None, None
5635 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5636 if not crg:
5637 return None, None, None
5638 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5639
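# Editor's sketch: both the request "Range" and response "Content-Range" forms.
def _parse_http_range_examples():
    assert parse_http_range('bytes=0-499') == (0, 499, None)
    assert parse_http_range('bytes 500-999/1234') == (500, 999, 1234)
    assert parse_http_range(None) == (None, None, None)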
5640
5641 def read_stdin(what):
5642 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5643 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5644 return sys.stdin
5645
5646
5647 def determine_file_encoding(data):
5648 """
5649 Detect the text encoding used
5650 @returns (encoding, bytes to skip)
5651 """
5652
5653 # BOMs are given priority over in-band coding declarations
5654 for bom, enc in BOMS:
5655 if data.startswith(bom):
5656 return enc, len(bom)
5657
5658 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5659 # We ignore the endianness to get a good enough match
5660 data = data.replace(b'\0', b'')
5661 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5662 return mobj.group(1).decode() if mobj else None, 0
5663
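# Editor's sketch (sample buffers are made up), assuming the module-level
# BOMS table maps b'\xff\xfe' to 'utf-16-le':
def _determine_file_encoding_examples():
    assert determine_file_encoding(b'\xff\xfeh\x00i\x00') == ('utf-16-le', 2)
    assert determine_file_encoding(b'# coding: utf-8\n--flag') == ('utf-8', 0)
    assert determine_file_encoding(b'--flag value') == (None, 0)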
5664
5665 class Config:
5666 own_args = None
5667 parsed_args = None
5668 filename = None
5669 __initialized = False
5670
5671 def __init__(self, parser, label=None):
5672 self.parser, self.label = parser, label
5673 self._loaded_paths, self.configs = set(), []
5674
5675 def init(self, args=None, filename=None):
5676 assert not self.__initialized
5677 self.own_args, self.filename = args, filename
5678 return self.load_configs()
5679
5680 def load_configs(self):
5681 directory = ''
5682 if self.filename:
5683 location = os.path.realpath(self.filename)
5684 directory = os.path.dirname(location)
5685 if location in self._loaded_paths:
5686 return False
5687 self._loaded_paths.add(location)
5688
5689 self.__initialized = True
5690 opts, _ = self.parser.parse_known_args(self.own_args)
5691 self.parsed_args = self.own_args
5692 for location in opts.config_locations or []:
5693 if location == '-':
5694 if location in self._loaded_paths:
5695 continue
5696 self._loaded_paths.add(location)
5697 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5698 continue
5699 location = os.path.join(directory, expand_path(location))
5700 if os.path.isdir(location):
5701 location = os.path.join(location, 'yt-dlp.conf')
5702 if not os.path.exists(location):
5703 self.parser.error(f'config location {location} does not exist')
5704 self.append_config(self.read_file(location), location)
5705 return True
5706
5707 def __str__(self):
5708 label = join_nonempty(
5709 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5710 delim=' ')
5711 return join_nonempty(
5712 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5713 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5714 delim='\n')
5715
5716 @staticmethod
5717 def read_file(filename, default=[]):
5718 try:
5719 optionf = open(filename, 'rb')
5720 except OSError:
5721 return default # silently skip if file is not present
5722 try:
5723 enc, skip = determine_file_encoding(optionf.read(512))
5724 optionf.seek(skip, io.SEEK_SET)
5725 except OSError:
5726 enc = None # silently skip read errors
5727 try:
5728 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5729 contents = optionf.read().decode(enc or preferredencoding())
5730 res = shlex.split(contents, comments=True)
5731 except Exception as err:
5732 raise ValueError(f'Unable to parse "{filename}": {err}')
5733 finally:
5734 optionf.close()
5735 return res
5736
5737 @staticmethod
5738 def hide_login_info(opts):
5739 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5740 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5741
5742 def _scrub_eq(o):
5743 m = eqre.match(o)
5744 if m:
5745 return m.group('key') + '=PRIVATE'
5746 else:
5747 return o
5748
5749 opts = list(map(_scrub_eq, opts))
5750 for idx, opt in enumerate(opts):
5751 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5752 opts[idx + 1] = 'PRIVATE'
5753 return opts
5754
5755 def append_config(self, *args, label=None):
5756 config = type(self)(self.parser, label)
5757 config._loaded_paths = self._loaded_paths
5758 if config.init(*args):
5759 self.configs.append(config)
5760
5761 @property
5762 def all_args(self):
5763 for config in reversed(self.configs):
5764 yield from config.all_args
5765 yield from self.parsed_args or []
5766
5767 def parse_known_args(self, **kwargs):
5768 return self.parser.parse_known_args(self.all_args, **kwargs)
5769
5770 def parse_args(self):
5771 return self.parser.parse_args(self.all_args)
5772
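# Editor's sketch: `Config.hide_login_info` scrubs credentials in both the
# '--opt value' and '--opt=value' spellings (the argv below is made up).
def _hide_login_info_example():
    assert Config.hide_login_info(['-u', 'me', '--password=hunter2', '-v']) \
        == ['-u', 'PRIVATE', '--password=PRIVATE', '-v']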
5773
5774 class WebSocketsWrapper:
5775 """Wraps websockets module to use in non-async scopes"""
5776 pool = None
5777
5778 def __init__(self, url, headers=None, connect=True):
5779 self.loop = asyncio.new_event_loop()
5780 # XXX: "loop" is deprecated
5781 self.conn = websockets.connect(
5782 url, extra_headers=headers, ping_interval=None,
5783 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5784 if connect:
5785 self.__enter__()
5786 atexit.register(self.__exit__, None, None, None)
5787
5788 def __enter__(self):
5789 if not self.pool:
5790 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5791 return self
5792
5793 def send(self, *args):
5794 self.run_with_loop(self.pool.send(*args), self.loop)
5795
5796 def recv(self, *args):
5797 return self.run_with_loop(self.pool.recv(*args), self.loop)
5798
5799 def __exit__(self, type, value, traceback):
5800 try:
5801 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5802 finally:
5803 self._cancel_all_tasks(self.loop)  # must run while the loop is still open
5804 self.loop.close()
5805
5806 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5807 # for contributors: If any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
5808 @staticmethod
5809 def run_with_loop(main, loop):
5810 if not asyncio.iscoroutine(main):
5811 raise ValueError(f'a coroutine was expected, got {main!r}')
5812
5813 try:
5814 return loop.run_until_complete(main)
5815 finally:
5816 loop.run_until_complete(loop.shutdown_asyncgens())
5817 if hasattr(loop, 'shutdown_default_executor'):
5818 loop.run_until_complete(loop.shutdown_default_executor())
5819
5820 @staticmethod
5821 def _cancel_all_tasks(loop):
5822 to_cancel = asyncio.all_tasks(loop)
5823
5824 if not to_cancel:
5825 return
5826
5827 for task in to_cancel:
5828 task.cancel()
5829
5830 # The "loop" argument was removed in Python 3.10+; gather() infers it from the tasks
5831 loop.run_until_complete(
5832 asyncio.gather(*to_cancel, return_exceptions=True))
5833
5834 for task in to_cancel:
5835 if task.cancelled():
5836 continue
5837 if task.exception() is not None:
5838 loop.call_exception_handler({
5839 'message': 'unhandled exception during asyncio.run() shutdown',
5840 'exception': task.exception(),
5841 'task': task,
5842 })
5843
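# Editor's sketch (not executed: it needs a live endpoint; the URL is made
# up): typical synchronous use of WebSocketsWrapper.
# ws = WebSocketsWrapper('wss://live.example.com/socket', headers={'Origin': 'https://example.com'})
# ws.send('{"op": "subscribe"}')
# message = ws.recv()
# ws.__exit__(None, None, None)  # closes the connection and the event loop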
5844
5845 def merge_headers(*dicts):
5846 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5847 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5848
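# Editor's sketch: keys are normalized with str.title(), so later dicts win
# regardless of the casing used.
def _merge_headers_example():
    assert merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'USER-AGENT': 'B'}) \
        == {'User-Agent': 'B', 'Accept': '*/*'}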
5849
5850 def cached_method(f):
5851 """Cache a method"""
5852 signature = inspect.signature(f)
5853
5854 @functools.wraps(f)
5855 def wrapper(self, *args, **kwargs):
5856 bound_args = signature.bind(self, *args, **kwargs)
5857 bound_args.apply_defaults()
5858 key = tuple(bound_args.arguments.values())[1:]
5859
5860 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5861 if key not in cache:
5862 cache[key] = f(self, *args, **kwargs)
5863 return cache[key]
5864 return wrapper
5865
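# Editor's sketch: results are memoized per instance, keyed on the bound,
# defaulted arguments (excluding `self`).
def _cached_method_demo():
    class Demo:
        calls = 0

        @cached_method
        def double(self, x):
            self.calls += 1
            return x * 2

    demo = Demo()
    assert demo.double(21) == demo.double(21) == 42
    assert demo.calls == 1  # the second call was served from the cache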
5866
5867 class classproperty:
5868 """property access for class methods with optional caching"""
5869 def __new__(cls, func=None, *args, **kwargs):
5870 if not func:
5871 return functools.partial(cls, *args, **kwargs)
5872 return super().__new__(cls)
5873
5874 def __init__(self, func, *, cache=False):
5875 functools.update_wrapper(self, func)
5876 self.func = func
5877 self._cache = {} if cache else None
5878
5879 def __get__(self, _, cls):
5880 if self._cache is None:
5881 return self.func(cls)
5882 elif cls not in self._cache:
5883 self._cache[cls] = self.func(cls)
5884 return self._cache[cls]
5885
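# Editor's sketch: the value is computed on the class, not an instance; with
# `cache=True`, each class (including subclasses) gets its own cached value.
def _classproperty_demo():
    class Base:
        @classproperty(cache=True)
        def tag(cls):
            return cls.__name__.lower()

    class Child(Base):
        pass

    assert (Base.tag, Child.tag) == ('base', 'child')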
5886
5887 class Namespace(types.SimpleNamespace):
5888 """Immutable namespace"""
5889
5890 def __iter__(self):
5891 return iter(self.__dict__.values())
5892
5893 @property
5894 def items_(self):
5895 return self.__dict__.items()
5896
5897
5898 MEDIA_EXTENSIONS = Namespace(
5899 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5900 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5901 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5902 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5903 thumbnails=('jpg', 'png', 'webp'),
5904 storyboards=('mhtml', ),
5905 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5906 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5907 )
5908 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5909 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5910
5911 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5912
5913
5914 class RetryManager:
5915 """Usage:
5916 for retry in RetryManager(...):
5917 try:
5918 ...
5919 except SomeException as err:
5920 retry.error = err
5921 continue
5922 """
5923 attempt, _error = 0, None
5924
5925 def __init__(self, _retries, _error_callback, **kwargs):
5926 self.retries = _retries or 0
5927 self.error_callback = functools.partial(_error_callback, **kwargs)
5928
5929 def _should_retry(self):
5930 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5931
5932 @property
5933 def error(self):
5934 if self._error is NO_DEFAULT:
5935 return None
5936 return self._error
5937
5938 @error.setter
5939 def error(self, value):
5940 self._error = value
5941
5942 def __iter__(self):
5943 while self._should_retry():
5944 self.error = NO_DEFAULT
5945 self.attempt += 1
5946 yield self
5947 if self.error:
5948 self.error_callback(self.error, self.attempt, self.retries)
5949
5950 @staticmethod
5951 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5952 """Utility function for reporting retries"""
5953 if count > retries:
5954 if error:
5955 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5956 raise e
5957
5958 if not count:
5959 return warn(e)
5960 elif isinstance(e, ExtractorError):
5961 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5962 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5963
5964 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5965 if delay:
5966 info(f'Sleeping {delay:.2f} seconds ...')
5967 time.sleep(delay)
5968
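# Editor's sketch: a complete retry loop. The callback runs after every failed
# attempt; like `report_retry` above, it should only raise once retries are spent.
def _retry_manager_demo():
    attempts = []

    def _callback(err, count, retries):
        if count > retries:
            raise err

    for retry in RetryManager(3, _callback):
        try:
            attempts.append(retry.attempt)
            if retry.attempt < 2:
                raise OSError('transient failure')
        except OSError as err:
            retry.error = err
            continue
    assert attempts == [1, 2]  # succeeded on the second of up to 4 attempts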
5969
5970 def make_archive_id(ie, video_id):
5971 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5972 return f'{ie_key.lower()} {video_id}'
5973
5974
5975 def truncate_string(s, left, right=0):
5976 assert left > 3 and right >= 0
5977 if s is None or len(s) <= left + right:
5978 return s
5979 return f'{s[:left - 3]}...{s[-right:] if right else ""}'  # s[-0:] would be the whole string
5980
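# Editor's sketch: keeps `left - 3` head characters plus `right` tail
# characters around the ellipsis.
def _truncate_string_examples():
    assert truncate_string('abcdefghij', 6) == 'abc...'
    assert truncate_string('abcdefghij', 6, 2) == 'abc...ij'
    assert truncate_string('short', 10) == 'short'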
5981
5982 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5983 assert 'all' in alias_dict, '"all" alias is required'
5984 requested = list(start or [])
5985 for val in options:
5986 discard = val.startswith('-')
5987 if discard:
5988 val = val[1:]
5989
5990 if val in alias_dict:
5991 val = alias_dict[val] if not discard else [
5992 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5993 # NB: Do not allow regex in aliases for performance
5994 requested = orderedSet_from_options(val, alias_dict, start=requested)
5995 continue
5996
5997 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5998 else [val] if val in alias_dict['all'] else None)
5999 if current is None:
6000 raise ValueError(val)
6001
6002 if discard:
6003 for item in current:
6004 while item in requested:
6005 requested.remove(item)
6006 else:
6007 requested.extend(current)
6008
6009 return orderedSet(requested)
6010
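# Editor's sketch (toy alias table; 'all' is the required catch-all alias):
# aliases expand recursively, and a '-' prefix discards matching entries.
def _ordered_set_from_options_demo():
    aliases = {'all': ['thumbnail', 'subs', 'chapters'], 'meta': ['thumbnail', 'chapters']}
    assert orderedSet_from_options(['meta', '-chapters', 'subs'], aliases) == ['thumbnail', 'subs']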
6011
6012 class FormatSorter:
6013 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6014
6015 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6016 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6017 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6018 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6019 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6020 'fps', 'fs_approx', 'source', 'id')
6021
6022 settings = {
6023 'vcodec': {'type': 'ordered', 'regex': True,
6024 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6025 'acodec': {'type': 'ordered', 'regex': True,
6026 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6027 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6028 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6029 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6030 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6031 'vext': {'type': 'ordered', 'field': 'video_ext',
6032 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6033 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6034 'aext': {'type': 'ordered', 'field': 'audio_ext',
6035 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6036 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6037 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6038 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6039 'field': ('vcodec', 'acodec'),
6040 'function': lambda it: int(any(v != 'none' for v in it))},
6041 'ie_pref': {'priority': True, 'type': 'extractor'},
6042 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6043 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6044 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6045 'quality': {'convert': 'float', 'default': -1},
6046 'filesize': {'convert': 'bytes'},
6047 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6048 'id': {'convert': 'string', 'field': 'format_id'},
6049 'height': {'convert': 'float_none'},
6050 'width': {'convert': 'float_none'},
6051 'fps': {'convert': 'float_none'},
6052 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6053 'tbr': {'convert': 'float_none'},
6054 'vbr': {'convert': 'float_none'},
6055 'abr': {'convert': 'float_none'},
6056 'asr': {'convert': 'float_none'},
6057 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6058
6059 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6060 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6061 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6062 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6063 'res': {'type': 'multiple', 'field': ('height', 'width'),
6064 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6065
6066 # Actual field names
6067 'format_id': {'type': 'alias', 'field': 'id'},
6068 'preference': {'type': 'alias', 'field': 'ie_pref'},
6069 'language_preference': {'type': 'alias', 'field': 'lang'},
6070 'source_preference': {'type': 'alias', 'field': 'source'},
6071 'protocol': {'type': 'alias', 'field': 'proto'},
6072 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6073 'audio_channels': {'type': 'alias', 'field': 'channels'},
6074
6075 # Deprecated
6076 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6077 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6078 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6079 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6080 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6081 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6082 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6083 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6084 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6085 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6086 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6087 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6088 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6089 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6090 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6091 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6092 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6093 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6094 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6095 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6096 }
6097
6098 def __init__(self, ydl, field_preference):
6099 self.ydl = ydl
6100 self._order = []
6101 self.evaluate_params(self.ydl.params, field_preference)
6102 if ydl.params.get('verbose'):
6103 self.print_verbose_info(self.ydl.write_debug)
6104
6105 def _get_field_setting(self, field, key):
6106 if field not in self.settings:
6107 if key in ('forced', 'priority'):
6108 return False
6109 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6110 'deprecated and may be removed in a future version')
6111 self.settings[field] = {}
6112 propObj = self.settings[field]
6113 if key not in propObj:
6114 type = propObj.get('type')
6115 if key == 'field':
6116 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6117 elif key == 'convert':
6118 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6119 else:
6120 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6121 propObj[key] = default
6122 return propObj[key]
6123
6124 def _resolve_field_value(self, field, value, convertNone=False):
6125 if value is None:
6126 if not convertNone:
6127 return None
6128 else:
6129 value = value.lower()
6130 conversion = self._get_field_setting(field, 'convert')
6131 if conversion == 'ignore':
6132 return None
6133 if conversion == 'string':
6134 return value
6135 elif conversion == 'float_none':
6136 return float_or_none(value)
6137 elif conversion == 'bytes':
6138 return parse_bytes(value)
6139 elif conversion == 'order':
6140 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6141 use_regex = self._get_field_setting(field, 'regex')
6142 list_length = len(order_list)
6143 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6144 if use_regex and value is not None:
6145 for i, regex in enumerate(order_list):
6146 if regex and re.match(regex, value):
6147 return list_length - i
6148 return list_length - empty_pos # not in list
6149 else: # not regex, or value is None
6150 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6151 else:
6152 if value.isnumeric():
6153 return float(value)
6154 else:
6155 self.settings[field]['convert'] = 'string'
6156 return value
6157
6158 def evaluate_params(self, params, sort_extractor):
6159 self._use_free_order = params.get('prefer_free_formats', False)
6160 self._sort_user = params.get('format_sort', [])
6161 self._sort_extractor = sort_extractor
6162
6163 def add_item(field, reverse, closest, limit_text):
6164 field = field.lower()
6165 if field in self._order:
6166 return
6167 self._order.append(field)
6168 limit = self._resolve_field_value(field, limit_text)
6169 data = {
6170 'reverse': reverse,
6171 'closest': False if limit is None else closest,
6172 'limit_text': limit_text,
6173 'limit': limit}
6174 if field in self.settings:
6175 self.settings[field].update(data)
6176 else:
6177 self.settings[field] = data
6178
6179 sort_list = (
6180 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6181 + (tuple() if params.get('format_sort_force', False)
6182 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6183 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6184
6185 for item in sort_list:
6186 match = re.match(self.regex, item)
6187 if match is None:
6188 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6189 field = match.group('field')
6190 if field is None:
6191 continue
6192 if self._get_field_setting(field, 'type') == 'alias':
6193 alias, field = field, self._get_field_setting(field, 'field')
6194 if self._get_field_setting(alias, 'deprecated'):
6195 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6196 f'be removed in a future version. Please use {field} instead')
6197 reverse = match.group('reverse') is not None
6198 closest = match.group('separator') == '~'
6199 limit_text = match.group('limit')
6200
6201 has_limit = limit_text is not None
6202 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6203 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6204
6205 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6206 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6207 limit_count = len(limits)
6208 for (i, f) in enumerate(fields):
6209 add_item(f, reverse, closest,
6210 limits[i] if i < limit_count
6211 else limits[0] if has_limit and not has_multiple_limits
6212 else None)
6213
6214 def print_verbose_info(self, write_debug):
6215 if self._sort_user:
6216 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6217 if self._sort_extractor:
6218 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6219 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6220 '+' if self._get_field_setting(field, 'reverse') else '', field,
6221 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6222 self._get_field_setting(field, 'limit_text'),
6223 self._get_field_setting(field, 'limit'))
6224 if self._get_field_setting(field, 'limit_text') is not None else '')
6225 for field in self._order if self._get_field_setting(field, 'visible')]))
6226
6227 def _calculate_field_preference_from_value(self, format, field, type, value):
6228 reverse = self._get_field_setting(field, 'reverse')
6229 closest = self._get_field_setting(field, 'closest')
6230 limit = self._get_field_setting(field, 'limit')
6231
6232 if type == 'extractor':
6233 maximum = self._get_field_setting(field, 'max')
6234 if value is None or (maximum is not None and value >= maximum):
6235 value = -1
6236 elif type == 'boolean':
6237 in_list = self._get_field_setting(field, 'in_list')
6238 not_in_list = self._get_field_setting(field, 'not_in_list')
6239 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6240 elif type == 'ordered':
6241 value = self._resolve_field_value(field, value, True)
6242
6243 # try to convert to number
6244 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6245 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6246 if is_num:
6247 value = val_num
6248
6249 return ((-10, 0) if value is None
6250 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6251 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6252 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6253 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6254 else (-1, value, 0))
6255
6256 def _calculate_field_preference(self, format, field):
6257 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6258 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6259 if type == 'multiple':
6260 type = 'field' # Only 'field' is allowed in multiple for now
6261 actual_fields = self._get_field_setting(field, 'field')
6262
6263 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6264 else:
6265 value = get_value(field)
6266 return self._calculate_field_preference_from_value(format, field, type, value)
6267
6268 def calculate_preference(self, format):
6269 # Determine missing protocol
6270 if not format.get('protocol'):
6271 format['protocol'] = determine_protocol(format)
6272
6273 # Determine missing ext
6274 if not format.get('ext') and 'url' in format:
6275 format['ext'] = determine_ext(format['url'])
6276 if format.get('vcodec') == 'none':
6277 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6278 format['video_ext'] = 'none'
6279 else:
6280 format['video_ext'] = format['ext']
6281 format['audio_ext'] = 'none'
6282 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6283 # format['preference'] = -1000
6284
6285 # Determine missing bitrates
6286 if format.get('tbr') is None:
6287 if format.get('vbr') is not None and format.get('abr') is not None:
6288 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6289 else:
6290 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6291 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6292 if format.get('acodec') != 'none' and format.get('abr') is None:
6293 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6294
6295 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6296
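# Editor's sketch: how `FormatSorter.regex` decomposes one sort token (e.g.
# from `--format-sort`): '+' reverses, ':' gives a limit, '~' means "closest".
def _format_sort_token_demo():
    m = re.match(FormatSorter.regex, '+res:720')
    assert (m.group('reverse'), m.group('field'), m.group('separator'), m.group('limit')) \
        == ('+', 'res', ':', '720')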
6297
6298 # Deprecated
6299 has_certifi = bool(certifi)
6300 has_websockets = bool(websockets)