yt_dlp/utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import importlib.util
  22 import inspect
  23 import io
  24 import itertools
  25 import json
  26 import locale
  27 import math
  28 import mimetypes
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import shlex
  35 import socket
  36 import ssl
  37 import struct
  38 import subprocess
  39 import sys
  40 import tempfile
  41 import time
  42 import traceback
  43 import types
  44 import unicodedata
  45 import urllib.error
  46 import urllib.parse
  47 import urllib.request
  48 import xml.etree.ElementTree
  49 import zlib
  50
  51 from .compat import functools  # isort: split
  52 from .compat import (
  53     compat_etree_fromstring,
  54     compat_expanduser,
  55     compat_HTMLParseError,
  56     compat_os_name,
  57     compat_shlex_quote,
  58 )
  59 from .dependencies import brotli, certifi, websockets, xattr
  60 from .socks import ProxyType, sockssocket
  61
  62
  63 def register_socks_protocols():
  64     # "Register" SOCKS protocols
  65     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  66     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  67     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  68         if scheme not in urllib.parse.uses_netloc:
  69             urllib.parse.uses_netloc.append(scheme)
  70
  71
  72 # This is not clearly defined otherwise
  73 compiled_regex_type = type(re.compile(''))
  74
  75
  76 def random_user_agent():
  77     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  78     _CHROME_VERSIONS = (
  79         '90.0.4430.212',
  80         '90.0.4430.24',
  81         '90.0.4430.70',
  82         '90.0.4430.72',
  83         '90.0.4430.85',
  84         '90.0.4430.93',
  85         '91.0.4472.101',
  86         '91.0.4472.106',
  87         '91.0.4472.114',
  88         '91.0.4472.124',
  89         '91.0.4472.164',
  90         '91.0.4472.19',
  91         '91.0.4472.77',
  92         '92.0.4515.107',
  93         '92.0.4515.115',
  94         '92.0.4515.131',
  95         '92.0.4515.159',
  96         '92.0.4515.43',
  97         '93.0.4556.0',
  98         '93.0.4577.15',
  99         '93.0.4577.63',
 100         '93.0.4577.82',
 101         '94.0.4606.41',
 102         '94.0.4606.54',
 103         '94.0.4606.61',
 104         '94.0.4606.71',
 105         '94.0.4606.81',
 106         '94.0.4606.85',
 107         '95.0.4638.17',
 108         '95.0.4638.50',
 109         '95.0.4638.54',
 110         '95.0.4638.69',
 111         '95.0.4638.74',
 112         '96.0.4664.18',
 113         '96.0.4664.45',
 114         '96.0.4664.55',
 115         '96.0.4664.93',
 116         '97.0.4692.20',
 117     )
 118     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 119
 120
 121 SUPPORTED_ENCODINGS = [
 122     'gzip', 'deflate'
 123 ]
 124 if brotli:
 125     SUPPORTED_ENCODINGS.append('br')
 126
 127 std_headers = {
 128     'User-Agent': random_user_agent(),
 129     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 130     'Accept-Language': 'en-us,en;q=0.5',
 131     'Sec-Fetch-Mode': 'navigate',
 132 }
 133
 134
 135 USER_AGENTS = {
 136     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 137 }
 138
 139
 140 NO_DEFAULT = object()
 141 IDENTITY = lambda x: x
 142
 143 ENGLISH_MONTH_NAMES = [
 144     'January', 'February', 'March', 'April', 'May', 'June',
 145     'July', 'August', 'September', 'October', 'November', 'December']
 146
 147 MONTH_NAMES = {
 148     'en': ENGLISH_MONTH_NAMES,
 149     'fr': [
 150         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 151         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 152     # these follow the genitive grammatical case (dopełniacz)
 153     # some websites might be using nominative, which will require another month list
 154     # https://en.wikibooks.org/wiki/Polish/Noun_cases
 155     'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
 156            'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
 157 }
 158
 159 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
 160 TIMEZONE_NAMES = {
 161     'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
 162     'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
 163     'EST': -5, 'EDT': -4,  # Eastern
 164     'CST': -6, 'CDT': -5,  # Central
 165     'MST': -7, 'MDT': -6,  # Mountain
 166     'PST': -8, 'PDT': -7   # Pacific
 167 }
 168
 169 # needed for sanitizing filenames in restricted mode
 170 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 171                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 172                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 173
 174 DATE_FORMATS = (
 175     '%d %B %Y',
 176     '%d %b %Y',
 177     '%B %d %Y',
 178     '%B %dst %Y',
 179     '%B %dnd %Y',
 180     '%B %drd %Y',
 181     '%B %dth %Y',
 182     '%b %d %Y',
 183     '%b %dst %Y',
 184     '%b %dnd %Y',
 185     '%b %drd %Y',
 186     '%b %dth %Y',
 187     '%b %dst %Y %I:%M',
 188     '%b %dnd %Y %I:%M',
 189     '%b %drd %Y %I:%M',
 190     '%b %dth %Y %I:%M',
 191     '%Y %m %d',
 192     '%Y-%m-%d',
 193     '%Y.%m.%d.',
 194     '%Y/%m/%d',
 195     '%Y/%m/%d %H:%M',
 196     '%Y/%m/%d %H:%M:%S',
 197     '%Y%m%d%H%M',
 198     '%Y%m%d%H%M%S',
 199     '%Y%m%d',
 200     '%Y-%m-%d %H:%M',
 201     '%Y-%m-%d %H:%M:%S',
 202     '%Y-%m-%d %H:%M:%S.%f',
 203     '%Y-%m-%d %H:%M:%S:%f',
 204     '%d.%m.%Y %H:%M',
 205     '%d.%m.%Y %H.%M',
 206     '%Y-%m-%dT%H:%M:%SZ',
 207     '%Y-%m-%dT%H:%M:%S.%fZ',
 208     '%Y-%m-%dT%H:%M:%S.%f0Z',
 209     '%Y-%m-%dT%H:%M:%S',
 210     '%Y-%m-%dT%H:%M:%S.%f',
 211     '%Y-%m-%dT%H:%M',
 212     '%b %d %Y at %H:%M',
 213     '%b %d %Y at %H:%M:%S',
 214     '%B %d %Y at %H:%M',
 215     '%B %d %Y at %H:%M:%S',
 216     '%H:%M %d-%b-%Y',
 217 )
 218
 219 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 220 DATE_FORMATS_DAY_FIRST.extend([
 221     '%d-%m-%Y',
 222     '%d.%m.%Y',
 223     '%d.%m.%y',
 224     '%d/%m/%Y',
 225     '%d/%m/%y',
 226     '%d/%m/%Y %H:%M:%S',
 227     '%d-%m-%Y %H:%M',
 228 ])
 229
 230 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 231 DATE_FORMATS_MONTH_FIRST.extend([
 232     '%m-%d-%Y',
 233     '%m.%d.%Y',
 234     '%m/%d/%Y',
 235     '%m/%d/%y',
 236     '%m/%d/%Y %H:%M:%S',
 237 ])
 238
 239 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 240 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
 241
 242 NUMBER_RE = r'\d+(?:\.\d+)?'
 243
 244
 245 @functools.cache
 246 def preferredencoding():
 247     """Get preferred encoding.
 248
 249     Returns the best encoding scheme for the system, based on
 250     locale.getpreferredencoding() and some further tweaks.
 251     """
 252     try:
 253         pref = locale.getpreferredencoding()
 254         'TEST'.encode(pref)
 255     except Exception:
 256         pref = 'UTF-8'
 257
 258     return pref
 259
 260
 261 def write_json_file(obj, fn):
 262     """ Encode obj as JSON and write it to fn, atomically if possible """
 263
 264     tf = tempfile.NamedTemporaryFile(
 265         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 266         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 267
 268     try:
 269         with tf:
 270             json.dump(obj, tf, ensure_ascii=False)
 271         if sys.platform == 'win32':
 272             # Need to remove existing file on Windows, else os.rename raises
 273             # WindowsError or FileExistsError.
 274             with contextlib.suppress(OSError):
 275                 os.unlink(fn)
 276         with contextlib.suppress(OSError):
 277             mask = os.umask(0)
 278             os.umask(mask)
 279             os.chmod(tf.name, 0o666 & ~mask)
 280         os.rename(tf.name, fn)
 281     except Exception:
 282         with contextlib.suppress(OSError):
 283             os.remove(tf.name)
 284         raise
 285
 286
 287 def find_xpath_attr(node, xpath, key, val=None):
 288     """ Find the xpath xpath[@key=val] """
 289     assert re.match(r'^[a-zA-Z_-]+$', key)
 290     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 291     return node.find(expr)
 292
 293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 294 # the namespace parameter
 295
 296
 297 def xpath_with_ns(path, ns_map):
 298     components = [c.split(':') for c in path.split('/')]
 299     replaced = []
 300     for c in components:
 301         if len(c) == 1:
 302             replaced.append(c[0])
 303         else:
 304             ns, tag = c
 305             replaced.append('{%s}%s' % (ns_map[ns], tag))
 306     return '/'.join(replaced)
 307
 308
 309 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 310     def _find_xpath(xpath):
 311         return node.find(xpath)
 312
 313     if isinstance(xpath, str):
 314         n = _find_xpath(xpath)
 315     else:
 316         for xp in xpath:
 317             n = _find_xpath(xp)
 318             if n is not None:
 319                 break
 320
 321     if n is None:
 322         if default is not NO_DEFAULT:
 323             return default
 324         elif fatal:
 325             name = xpath if name is None else name
 326             raise ExtractorError('Could not find XML element %s' % name)
 327         else:
 328             return None
 329     return n
 330
 331
 332 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 333     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 334     if n is None or n == default:
 335         return n
 336     if n.text is None:
 337         if default is not NO_DEFAULT:
 338             return default
 339         elif fatal:
 340             name = xpath if name is None else name
 341             raise ExtractorError('Could not find XML element\'s text %s' % name)
 342         else:
 343             return None
 344     return n.text
 345
 346
 347 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 348     n = find_xpath_attr(node, xpath, key)
 349     if n is None:
 350         if default is not NO_DEFAULT:
 351             return default
 352         elif fatal:
 353             name = f'{xpath}[@{key}]' if name is None else name
 354             raise ExtractorError('Could not find XML attribute %s' % name)
 355         else:
 356             return None
 357     return n.attrib[key]
 358
 359
 360 def get_element_by_id(id, html, **kwargs):
 361     """Return the content of the tag with the specified ID in the passed HTML document"""
 362     return get_element_by_attribute('id', id, html, **kwargs)
 363
 364
 365 def get_element_html_by_id(id, html, **kwargs):
 366     """Return the html of the tag with the specified ID in the passed HTML document"""
 367     return get_element_html_by_attribute('id', id, html, **kwargs)
 368
 369
 370 def get_element_by_class(class_name, html):
 371     """Return the content of the first tag with the specified class in the passed HTML document"""
 372     retval = get_elements_by_class(class_name, html)
 373     return retval[0] if retval else None
 374
 375
 376 def get_element_html_by_class(class_name, html):
 377     """Return the html of the first tag with the specified class in the passed HTML document"""
 378     retval = get_elements_html_by_class(class_name, html)
 379     return retval[0] if retval else None
 380
 381
 382 def get_element_by_attribute(attribute, value, html, **kwargs):
 383     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 384     return retval[0] if retval else None
 385
 386
 387 def get_element_html_by_attribute(attribute, value, html, **kargs):
 388     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 389     return retval[0] if retval else None
 390
 391
 392 def get_elements_by_class(class_name, html, **kargs):
 393     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 394     return get_elements_by_attribute(
 395         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 396         html, escape_value=False)
 397
 398
 399 def get_elements_html_by_class(class_name, html):
 400     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 401     return get_elements_html_by_attribute(
 402         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 403         html, escape_value=False)
 404
 405
 406 def get_elements_by_attribute(*args, **kwargs):
 407     """Return the content of the tag with the specified attribute in the passed HTML document"""
 408     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 409
 410
 411 def get_elements_html_by_attribute(*args, **kwargs):
 412     """Return the html of the tag with the specified attribute in the passed HTML document"""
 413     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 414
 415
 416 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
 417     """
 418     Return the text (content) and the html (whole) of the tag with the specified
 419     attribute in the passed HTML document
 420     """
 421     if not value:
 422         return
 423
 424     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 425
 426     value = re.escape(value) if escape_value else value
 427
 428     partial_element_re = rf'''(?x)
 429         <(?P<tag>{tag})
 430          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 431          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 432         '''
 433
 434     for m in re.finditer(partial_element_re, html):
 435         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 436
 437         yield (
 438             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 439             whole
 440         )
 441
 442
 443 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 444     """
 445     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 446     closing tag for the first opening tag it has encountered, and can be used
 447     as a context manager
 448     """
 449
 450     class HTMLBreakOnClosingTagException(Exception):
 451         pass
 452
 453     def __init__(self):
 454         self.tagstack = collections.deque()
 455         html.parser.HTMLParser.__init__(self)
 456
 457     def __enter__(self):
 458         return self
 459
 460     def __exit__(self, *_):
 461         self.close()
 462
 463     def close(self):
 464         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 465         # so data remains buffered; we no longer have any interest in it, thus
 466         # override this method to discard it
 467         pass
 468
 469     def handle_starttag(self, tag, _):
 470         self.tagstack.append(tag)
 471
 472     def handle_endtag(self, tag):
 473         if not self.tagstack:
 474             raise compat_HTMLParseError('no tags in the stack')
 475         while self.tagstack:
 476             inner_tag = self.tagstack.pop()
 477             if inner_tag == tag:
 478                 break
 479         else:
 480             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 481         if not self.tagstack:
 482             raise self.HTMLBreakOnClosingTagException()
 483
 484
 485 # XXX: This should be far less strict
 486 def get_element_text_and_html_by_tag(tag, html):
 487     """
 488     For the first element with the specified tag in the passed HTML document
 489     return its' content (text) and the whole element (html)
 490     """
 491     def find_or_raise(haystack, needle, exc):
 492         try:
 493             return haystack.index(needle)
 494         except ValueError:
 495             raise exc
 496     closing_tag = f'</{tag}>'
 497     whole_start = find_or_raise(
 498         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 499     content_start = find_or_raise(
 500         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 501     content_start += whole_start + 1
 502     with HTMLBreakOnClosingTagParser() as parser:
 503         parser.feed(html[whole_start:content_start])
 504         if not parser.tagstack or parser.tagstack[0] != tag:
 505             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 506         offset = content_start
 507         while offset < len(html):
 508             next_closing_tag_start = find_or_raise(
 509                 html[offset:], closing_tag,
 510                 compat_HTMLParseError(f'closing {tag} tag not found'))
 511             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 512             try:
 513                 parser.feed(html[offset:offset + next_closing_tag_end])
 514                 offset += next_closing_tag_end
 515             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 516                 return html[content_start:offset + next_closing_tag_start], \
 517                     html[whole_start:offset + next_closing_tag_end]
 518         raise compat_HTMLParseError('unexpected end of html')
 519
 520
 521 class HTMLAttributeParser(html.parser.HTMLParser):
 522     """Trivial HTML parser to gather the attributes for a single element"""
 523
 524     def __init__(self):
 525         self.attrs = {}
 526         html.parser.HTMLParser.__init__(self)
 527
 528     def handle_starttag(self, tag, attrs):
 529         self.attrs = dict(attrs)
 530         raise compat_HTMLParseError('done')
 531
 532
 533 class HTMLListAttrsParser(html.parser.HTMLParser):
 534     """HTML parser to gather the attributes for the elements of a list"""
 535
 536     def __init__(self):
 537         html.parser.HTMLParser.__init__(self)
 538         self.items = []
 539         self._level = 0
 540
 541     def handle_starttag(self, tag, attrs):
 542         if tag == 'li' and self._level == 0:
 543             self.items.append(dict(attrs))
 544         self._level += 1
 545
 546     def handle_endtag(self, tag):
 547         self._level -= 1
 548
 549
 550 def extract_attributes(html_element):
 551     """Given a string for an HTML element such as
 552     <el
 553          a="foo" B="bar" c="&98;az" d=boz
 554          empty= noval entity="&amp;"
 555          sq='"' dq="'"
 556     >
 557     Decode and return a dictionary of attributes.
 558     {
 559         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 560         'empty': '', 'noval': None, 'entity': '&',
 561         'sq': '"', 'dq': '\''
 562     }.
 563     """
 564     parser = HTMLAttributeParser()
 565     with contextlib.suppress(compat_HTMLParseError):
 566         parser.feed(html_element)
 567         parser.close()
 568     return parser.attrs
 569
 570
 571 def parse_list(webpage):
 572     """Given a string for an series of HTML <li> elements,
 573     return a dictionary of their attributes"""
 574     parser = HTMLListAttrsParser()
 575     parser.feed(webpage)
 576     parser.close()
 577     return parser.items
 578
 579
 580 def clean_html(html):
 581     """Clean an HTML snippet into a readable string"""
 582
 583     if html is None:  # Convenience for sanitizing descriptions etc.
 584         return html
 585
 586     html = re.sub(r'\s+', ' ', html)
 587     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 588     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 589     # Strip html tags
 590     html = re.sub('<.*?>', '', html)
 591     # Replace html entities
 592     html = unescapeHTML(html)
 593     return html.strip()
 594
 595
 596 class LenientJSONDecoder(json.JSONDecoder):
 597     def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
 598         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 599         super().__init__(*args, **kwargs)
 600
 601     def decode(self, s):
 602         if self.transform_source:
 603             s = self.transform_source(s)
 604         try:
 605             if self.ignore_extra:
 606                 return self.raw_decode(s.lstrip())[0]
 607             return super().decode(s)
 608         except json.JSONDecodeError as e:
 609             if e.pos is not None:
 610                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 611             raise
 612
 613
 614 def sanitize_open(filename, open_mode):
 615     """Try to open the given filename, and slightly tweak it if this fails.
 616
 617     Attempts to open the given filename. If this fails, it tries to change
 618     the filename slightly, step by step, until it's either able to open it
 619     or it fails and raises a final exception, like the standard open()
 620     function.
 621
 622     It returns the tuple (stream, definitive_file_name).
 623     """
 624     if filename == '-':
 625         if sys.platform == 'win32':
 626             import msvcrt
 627
 628             # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
 629             with contextlib.suppress(io.UnsupportedOperation):
 630                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 631         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 632
 633     for attempt in range(2):
 634         try:
 635             try:
 636                 if sys.platform == 'win32':
 637                     # FIXME: An exclusive lock also locks the file from being read.
 638                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 639                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 640                     raise LockingUnsupportedError()
 641                 stream = locked_file(filename, open_mode, block=False).__enter__()
 642             except OSError:
 643                 stream = open(filename, open_mode)
 644             return stream, filename
 645         except OSError as err:
 646             if attempt or err.errno in (errno.EACCES,):
 647                 raise
 648             old_filename, filename = filename, sanitize_path(filename)
 649             if old_filename == filename:
 650                 raise
 651
 652
 653 def timeconvert(timestr):
 654     """Convert RFC 2822 defined time string into system timestamp"""
 655     timestamp = None
 656     timetuple = email.utils.parsedate_tz(timestr)
 657     if timetuple is not None:
 658         timestamp = email.utils.mktime_tz(timetuple)
 659     return timestamp
 660
 661
 662 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 663     """Sanitizes a string so it could be used as part of a filename.
 664     @param restricted   Use a stricter subset of allowed characters
 665     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 666                         If unset, yt-dlp's new sanitization rules are in effect
 667     """
 668     if s == '':
 669         return ''
 670
 671     def replace_insane(char):
 672         if restricted and char in ACCENT_CHARS:
 673             return ACCENT_CHARS[char]
 674         elif not restricted and char == '\n':
 675             return '\0 '
 676         elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
 677             # Replace with their full-width unicode counterparts
 678             return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
 679         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 680             return ''
 681         elif char == '"':
 682             return '' if restricted else '\''
 683         elif char == ':':
 684             return '\0_\0-' if restricted else '\0 \0-'
 685         elif char in '\\/|*<>':
 686             return '\0_'
 687         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 688             return '\0_'
 689         return char
 690
 691     # Replace look-alike Unicode glyphs
 692     if restricted and (is_id is NO_DEFAULT or not is_id):
 693         s = unicodedata.normalize('NFKC', s)
 694     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 695     result = ''.join(map(replace_insane, s))
 696     if is_id is NO_DEFAULT:
 697         result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
 698         STRIP_RE = r'(?:\0.|[ _-])*'
 699         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 700     result = result.replace('\0', '') or '_'
 701
 702     if not is_id:
 703         while '__' in result:
 704             result = result.replace('__', '_')
 705         result = result.strip('_')
 706         # Common case of "Foreign band name - English song title"
 707         if restricted and result.startswith('-_'):
 708             result = result[2:]
 709         if result.startswith('-'):
 710             result = '_' + result[len('-'):]
 711         result = result.lstrip('.')
 712         if not result:
 713             result = '_'
 714     return result
 715
 716
 717 def sanitize_path(s, force=False):
 718     """Sanitizes and normalizes path on Windows"""
 719     if sys.platform == 'win32':
 720         force = False
 721         drive_or_unc, _ = os.path.splitdrive(s)
 722     elif force:
 723         drive_or_unc = ''
 724     else:
 725         return s
 726
 727     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 728     if drive_or_unc:
 729         norm_path.pop(0)
 730     sanitized_path = [
 731         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 732         for path_part in norm_path]
 733     if drive_or_unc:
 734         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 735     elif force and s and s[0] == os.path.sep:
 736         sanitized_path.insert(0, os.path.sep)
 737     return os.path.join(*sanitized_path)
 738
 739
 740 def sanitize_url(url, *, scheme='http'):
 741     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 742     # the number of unwanted failures due to missing protocol
 743     if url is None:
 744         return
 745     elif url.startswith('//'):
 746         return f'{scheme}:{url}'
 747     # Fix some common typos seen so far
 748     COMMON_TYPOS = (
 749         # https://github.com/ytdl-org/youtube-dl/issues/15649
 750         (r'^httpss://', r'https://'),
 751         # https://bx1.be/lives/direct-tv/
 752         (r'^rmtp([es]?)://', r'rtmp\1://'),
 753     )
 754     for mistake, fixup in COMMON_TYPOS:
 755         if re.match(mistake, url):
 756             return re.sub(mistake, fixup, url)
 757     return url
 758
 759
 760 def extract_basic_auth(url):
 761     parts = urllib.parse.urlsplit(url)
 762     if parts.username is None:
 763         return url, None
 764     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 765         parts.hostname if parts.port is None
 766         else '%s:%d' % (parts.hostname, parts.port))))
 767     auth_payload = base64.b64encode(
 768         ('%s:%s' % (parts.username, parts.password or '')).encode())
 769     return url, f'Basic {auth_payload.decode()}'
 770
 771
 772 def sanitized_Request(url, *args, **kwargs):
 773     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 774     if auth_header is not None:
 775         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 776         headers['Authorization'] = auth_header
 777     return urllib.request.Request(url, *args, **kwargs)
 778
 779
 780 def expand_path(s):
 781     """Expand shell variables and ~"""
 782     return os.path.expandvars(compat_expanduser(s))
 783
 784
 785 def orderedSet(iterable, *, lazy=False):
 786     """Remove all duplicates from the input iterable"""
 787     def _iter():
 788         seen = []  # Do not use set since the items can be unhashable
 789         for x in iterable:
 790             if x not in seen:
 791                 seen.append(x)
 792                 yield x
 793
 794     return _iter() if lazy else list(_iter())
 795
 796
 797 def _htmlentity_transform(entity_with_semicolon):
 798     """Transforms an HTML entity to a character."""
 799     entity = entity_with_semicolon[:-1]
 800
 801     # Known non-numeric HTML entity
 802     if entity in html.entities.name2codepoint:
 803         return chr(html.entities.name2codepoint[entity])
 804
 805     # TODO: HTML5 allows entities without a semicolon.
 806     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 807     if entity_with_semicolon in html.entities.html5:
 808         return html.entities.html5[entity_with_semicolon]
 809
 810     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 811     if mobj is not None:
 812         numstr = mobj.group(1)
 813         if numstr.startswith('x'):
 814             base = 16
 815             numstr = '0%s' % numstr
 816         else:
 817             base = 10
 818         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 819         with contextlib.suppress(ValueError):
 820             return chr(int(numstr, base))
 821
 822     # Unknown entity in name, return its literal representation
 823     return '&%s;' % entity
 824
 825
 826 def unescapeHTML(s):
 827     if s is None:
 828         return None
 829     assert isinstance(s, str)
 830
 831     return re.sub(
 832         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 833
 834
 835 def escapeHTML(text):
 836     return (
 837         text
 838         .replace('&', '&amp;')
 839         .replace('<', '&lt;')
 840         .replace('>', '&gt;')
 841         .replace('"', '&quot;')
 842         .replace("'", '&#39;')
 843     )
 844
 845
 846 def process_communicate_or_kill(p, *args, **kwargs):
 847     deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
 848                         f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
 849     return Popen.communicate_or_kill(p, *args, **kwargs)
 850
 851
 852 class Popen(subprocess.Popen):
 853     if sys.platform == 'win32':
 854         _startupinfo = subprocess.STARTUPINFO()
 855         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 856     else:
 857         _startupinfo = None
 858
 859     @staticmethod
 860     def _fix_pyinstaller_ld_path(env):
 861         """Restore LD_LIBRARY_PATH when using PyInstaller
 862             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 863                  https://github.com/yt-dlp/yt-dlp/issues/4573
 864         """
 865         if not hasattr(sys, '_MEIPASS'):
 866             return
 867
 868         def _fix(key):
 869             orig = env.get(f'{key}_ORIG')
 870             if orig is None:
 871                 env.pop(key, None)
 872             else:
 873                 env[key] = orig
 874
 875         _fix('LD_LIBRARY_PATH')  # Linux
 876         _fix('DYLD_LIBRARY_PATH')  # macOS
 877
 878     def __init__(self, *args, env=None, text=False, **kwargs):
 879         if env is None:
 880             env = os.environ.copy()
 881         self._fix_pyinstaller_ld_path(env)
 882
 883         if text is True:
 884             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 885             kwargs.setdefault('encoding', 'utf-8')
 886             kwargs.setdefault('errors', 'replace')
 887         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 888
 889     def communicate_or_kill(self, *args, **kwargs):
 890         try:
 891             return self.communicate(*args, **kwargs)
 892         except BaseException:  # Including KeyboardInterrupt
 893             self.kill(timeout=None)
 894             raise
 895
 896     def kill(self, *, timeout=0):
 897         super().kill()
 898         if timeout != 0:
 899             self.wait(timeout=timeout)
 900
 901     @classmethod
 902     def run(cls, *args, timeout=None, **kwargs):
 903         with cls(*args, **kwargs) as proc:
 904             default = '' if proc.text_mode else b''
 905             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 906             return stdout or default, stderr or default, proc.returncode
 907
 908
 909 def get_subprocess_encoding():
 910     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 911         # For subprocess calls, encode with locale encoding
 912         # Refer to http://stackoverflow.com/a/9951851/35070
 913         encoding = preferredencoding()
 914     else:
 915         encoding = sys.getfilesystemencoding()
 916     if encoding is None:
 917         encoding = 'utf-8'
 918     return encoding
 919
 920
 921 def encodeFilename(s, for_subprocess=False):
 922     assert isinstance(s, str)
 923     return s
 924
 925
 926 def decodeFilename(b, for_subprocess=False):
 927     return b
 928
 929
 930 def encodeArgument(s):
 931     # Legacy code that uses byte strings
 932     # Uncomment the following line after fixing all post processors
 933     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 934     return s if isinstance(s, str) else s.decode('ascii')
 935
 936
 937 def decodeArgument(b):
 938     return b
 939
 940
 941 def decodeOption(optval):
 942     if optval is None:
 943         return optval
 944     if isinstance(optval, bytes):
 945         optval = optval.decode(preferredencoding())
 946
 947     assert isinstance(optval, str)
 948     return optval
 949
 950
 951 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 952
 953
 954 def timetuple_from_msec(msec):
 955     secs, msec = divmod(msec, 1000)
 956     mins, secs = divmod(secs, 60)
 957     hrs, mins = divmod(mins, 60)
 958     return _timetuple(hrs, mins, secs, msec)
 959
 960
 961 def formatSeconds(secs, delim=':', msec=False):
 962     time = timetuple_from_msec(secs * 1000)
 963     if time.hours:
 964         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 965     elif time.minutes:
 966         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 967     else:
 968         ret = '%d' % time.seconds
 969     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 970
 971
 972 def _ssl_load_windows_store_certs(ssl_context, storename):
 973     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 974     try:
 975         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 976                  if encoding == 'x509_asn' and (
 977                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 978     except PermissionError:
 979         return
 980     for cert in certs:
 981         with contextlib.suppress(ssl.SSLError):
 982             ssl_context.load_verify_locations(cadata=cert)
 983
 984
 985 def make_HTTPS_handler(params, **kwargs):
 986     opts_check_certificate = not params.get('nocheckcertificate')
 987     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 988     context.check_hostname = opts_check_certificate
 989     if params.get('legacyserverconnect'):
 990         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
 991         # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
 992         context.set_ciphers('DEFAULT')
 993     elif (
 994         sys.version_info < (3, 10)
 995         and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
 996         and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
 997     ):
 998         # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
 999         # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
1000         # in some situations [2][3].
1001         # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
1002         # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
1003         # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
1004         # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
1005         # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
1006         # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
1007         # 4. https://peps.python.org/pep-0644/
1008         # 5. https://peps.python.org/pep-0644/#libressl-support
1009         # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
1010         context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1011         context.minimum_version = ssl.TLSVersion.TLSv1_2
1012
1013     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1014     if opts_check_certificate:
1015         if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
1016             context.load_verify_locations(cafile=certifi.where())
1017         else:
1018             try:
1019                 context.load_default_certs()
1020                 # Work around the issue in load_default_certs when there are bad certificates. See:
1021                 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1022                 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1023             except ssl.SSLError:
1024                 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1025                 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1026                     for storename in ('CA', 'ROOT'):
1027                         _ssl_load_windows_store_certs(context, storename)
1028                 context.set_default_verify_paths()
1029
1030     client_certfile = params.get('client_certificate')
1031     if client_certfile:
1032         try:
1033             context.load_cert_chain(
1034                 client_certfile, keyfile=params.get('client_certificate_key'),
1035                 password=params.get('client_certificate_password'))
1036         except ssl.SSLError:
1037             raise YoutubeDLError('Unable to load client certificate')
1038
1039     # Some servers may reject requests if ALPN extension is not sent. See:
1040     # https://github.com/python/cpython/issues/85140
1041     # https://github.com/yt-dlp/yt-dlp/issues/3878
1042     with contextlib.suppress(NotImplementedError):
1043         context.set_alpn_protocols(['http/1.1'])
1044
1045     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1046
1047
1048 def bug_reports_message(before=';'):
1049     from .update import REPOSITORY
1050
1051     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
1052            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
1053
1054     before = before.rstrip()
1055     if not before or before.endswith(('.', '!', '?')):
1056         msg = msg[0].title() + msg[1:]
1057
1058     return (before + ' ' if before else '') + msg
1059
1060
1061 class YoutubeDLError(Exception):
1062     """Base exception for YoutubeDL errors."""
1063     msg = None
1064
1065     def __init__(self, msg=None):
1066         if msg is not None:
1067             self.msg = msg
1068         elif self.msg is None:
1069             self.msg = type(self).__name__
1070         super().__init__(self.msg)
1071
1072
1073 network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
1074 if hasattr(ssl, 'CertificateError'):
1075     network_exceptions.append(ssl.CertificateError)
1076 network_exceptions = tuple(network_exceptions)
1077
1078
1079 class ExtractorError(YoutubeDLError):
1080     """Error during info extraction."""
1081
1082     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1083         """ tb, if given, is the original traceback (so that it can be printed out).
1084         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1085         """
1086         if sys.exc_info()[0] in network_exceptions:
1087             expected = True
1088
1089         self.orig_msg = str(msg)
1090         self.traceback = tb
1091         self.expected = expected
1092         self.cause = cause
1093         self.video_id = video_id
1094         self.ie = ie
1095         self.exc_info = sys.exc_info()  # preserve original exception
1096         if isinstance(self.exc_info[1], ExtractorError):
1097             self.exc_info = self.exc_info[1].exc_info
1098         super().__init__(self.__msg)
1099
1100     @property
1101     def __msg(self):
1102         return ''.join((
1103             format_field(self.ie, None, '[%s] '),
1104             format_field(self.video_id, None, '%s: '),
1105             self.orig_msg,
1106             format_field(self.cause, None, ' (caused by %r)'),
1107             '' if self.expected else bug_reports_message()))
1108
1109     def format_traceback(self):
1110         return join_nonempty(
1111             self.traceback and ''.join(traceback.format_tb(self.traceback)),
1112             self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1113             delim='\n') or None
1114
1115     def __setattr__(self, name, value):
1116         super().__setattr__(name, value)
1117         if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1118             self.msg = self.__msg or type(self).__name__
1119             self.args = (self.msg, )  # Cannot be property
1120
1121
1122 class UnsupportedError(ExtractorError):
1123     def __init__(self, url):
1124         super().__init__(
1125             'Unsupported URL: %s' % url, expected=True)
1126         self.url = url
1127
1128
1129 class RegexNotFoundError(ExtractorError):
1130     """Error when a regex didn't match"""
1131     pass
1132
1133
1134 class GeoRestrictedError(ExtractorError):
1135     """Geographic restriction Error exception.
1136
1137     This exception may be thrown when a video is not available from your
1138     geographic location due to geographic restrictions imposed by a website.
1139     """
1140
1141     def __init__(self, msg, countries=None, **kwargs):
1142         kwargs['expected'] = True
1143         super().__init__(msg, **kwargs)
1144         self.countries = countries
1145
1146
1147 class UserNotLive(ExtractorError):
1148     """Error when a channel/user is not live"""
1149
1150     def __init__(self, msg=None, **kwargs):
1151         kwargs['expected'] = True
1152         super().__init__(msg or 'The channel is not currently live', **kwargs)
1153
1154
1155 class DownloadError(YoutubeDLError):
1156     """Download Error exception.
1157
1158     This exception may be thrown by FileDownloader objects if they are not
1159     configured to continue on errors. They will contain the appropriate
1160     error message.
1161     """
1162
1163     def __init__(self, msg, exc_info=None):
1164         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1165         super().__init__(msg)
1166         self.exc_info = exc_info
1167
1168
1169 class EntryNotInPlaylist(YoutubeDLError):
1170     """Entry not in playlist exception.
1171
1172     This exception will be thrown by YoutubeDL when a requested entry
1173     is not found in the playlist info_dict
1174     """
1175     msg = 'Entry not found in info'
1176
1177
1178 class SameFileError(YoutubeDLError):
1179     """Same File exception.
1180
1181     This exception will be thrown by FileDownloader objects if they detect
1182     multiple files would have to be downloaded to the same file on disk.
1183     """
1184     msg = 'Fixed output name but more than one file to download'
1185
1186     def __init__(self, filename=None):
1187         if filename is not None:
1188             self.msg += f': {filename}'
1189         super().__init__(self.msg)
1190
1191
1192 class PostProcessingError(YoutubeDLError):
1193     """Post Processing exception.
1194
1195     This exception may be raised by PostProcessor's .run() method to
1196     indicate an error in the postprocessing task.
1197     """
1198
1199
1200 class DownloadCancelled(YoutubeDLError):
1201     """ Exception raised when the download queue should be interrupted """
1202     msg = 'The download was cancelled'
1203
1204
1205 class ExistingVideoReached(DownloadCancelled):
1206     """ --break-on-existing triggered """
1207     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1208
1209
1210 class RejectedVideoReached(DownloadCancelled):
1211     """ --break-on-reject triggered """
1212     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1213
1214
1215 class MaxDownloadsReached(DownloadCancelled):
1216     """ --max-downloads limit has been reached. """
1217     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1218
1219
1220 class ReExtractInfo(YoutubeDLError):
1221     """ Video info needs to be re-extracted. """
1222
1223     def __init__(self, msg, expected=False):
1224         super().__init__(msg)
1225         self.expected = expected
1226
1227
1228 class ThrottledDownload(ReExtractInfo):
1229     """ Download speed below --throttled-rate. """
1230     msg = 'The download speed is below throttle limit'
1231
1232     def __init__(self):
1233         super().__init__(self.msg, expected=False)
1234
1235
1236 class UnavailableVideoError(YoutubeDLError):
1237     """Unavailable Format exception.
1238
1239     This exception will be thrown when a video is requested
1240     in a format that is not available for that video.
1241     """
1242     msg = 'Unable to download video'
1243
1244     def __init__(self, err=None):
1245         if err is not None:
1246             self.msg += f': {err}'
1247         super().__init__(self.msg)
1248
1249
1250 class ContentTooShortError(YoutubeDLError):
1251     """Content Too Short exception.
1252
1253     This exception may be raised by FileDownloader objects when a file they
1254     download is too small for what the server announced first, indicating
1255     the connection was probably interrupted.
1256     """
1257
1258     def __init__(self, downloaded, expected):
1259         super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1260         # Both in bytes
1261         self.downloaded = downloaded
1262         self.expected = expected
1263
1264
1265 class XAttrMetadataError(YoutubeDLError):
1266     def __init__(self, code=None, msg='Unknown error'):
1267         super().__init__(msg)
1268         self.code = code
1269         self.msg = msg
1270
1271         # Parsing code and msg
1272         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1273                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1274             self.reason = 'NO_SPACE'
1275         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1276             self.reason = 'VALUE_TOO_LONG'
1277         else:
1278             self.reason = 'NOT_SUPPORTED'
1279
1280
1281 class XAttrUnavailableError(YoutubeDLError):
1282     pass
1283
1284
1285 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1286     hc = http_class(*args, **kwargs)
1287     source_address = ydl_handler._params.get('source_address')
1288
1289     if source_address is not None:
1290         # This is to workaround _create_connection() from socket where it will try all
1291         # address data from getaddrinfo() including IPv6. This filters the result from
1292         # getaddrinfo() based on the source_address value.
1293         # This is based on the cpython socket.create_connection() function.
1294         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1295         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1296             host, port = address
1297             err = None
1298             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1299             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1300             ip_addrs = [addr for addr in addrs if addr[0] == af]
1301             if addrs and not ip_addrs:
1302                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1303                 raise OSError(
1304                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1305                     % (ip_version, source_address[0]))
1306             for res in ip_addrs:
1307                 af, socktype, proto, canonname, sa = res
1308                 sock = None
1309                 try:
1310                     sock = socket.socket(af, socktype, proto)
1311                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1312                         sock.settimeout(timeout)
1313                     sock.bind(source_address)
1314                     sock.connect(sa)
1315                     err = None  # Explicitly break reference cycle
1316                     return sock
1317                 except OSError as _:
1318                     err = _
1319                     if sock is not None:
1320                         sock.close()
1321             if err is not None:
1322                 raise err
1323             else:
1324                 raise OSError('getaddrinfo returns an empty list')
1325         if hasattr(hc, '_create_connection'):
1326             hc._create_connection = _create_connection
1327         hc.source_address = (source_address, 0)
1328
1329     return hc
1330
1331
1332 def handle_youtubedl_headers(headers):
1333     filtered_headers = headers
1334
1335     if 'Youtubedl-no-compression' in filtered_headers:
1336         filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
1337         del filtered_headers['Youtubedl-no-compression']
1338
1339     return filtered_headers
1340
1341
1342 class YoutubeDLHandler(urllib.request.HTTPHandler):
1343     """Handler for HTTP requests and responses.
1344
1345     This class, when installed with an OpenerDirector, automatically adds
1346     the standard headers to every HTTP request and handles gzipped and
1347     deflated responses from web servers. If compression is to be avoided in
1348     a particular request, the original request in the program code only has
1349     to include the HTTP header "Youtubedl-no-compression", which will be
1350     removed before making the real request.
1351
1352     Part of this code was copied from:
1353
1354     http://techknack.net/python-urllib2-handlers/
1355
1356     Andrew Rowls, the author of that code, agreed to release it to the
1357     public domain.
1358     """
1359
1360     def __init__(self, params, *args, **kwargs):
1361         urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1362         self._params = params
1363
1364     def http_open(self, req):
1365         conn_class = http.client.HTTPConnection
1366
1367         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1368         if socks_proxy:
1369             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1370             del req.headers['Ytdl-socks-proxy']
1371
1372         return self.do_open(functools.partial(
1373             _create_http_connection, self, conn_class, False),
1374             req)
1375
1376     @staticmethod
1377     def deflate(data):
1378         if not data:
1379             return data
1380         try:
1381             return zlib.decompress(data, -zlib.MAX_WBITS)
1382         except zlib.error:
1383             return zlib.decompress(data)
1384
1385     @staticmethod
1386     def brotli(data):
1387         if not data:
1388             return data
1389         return brotli.decompress(data)
1390
1391     def http_request(self, req):
1392         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1393         # always respected by websites, some tend to give out URLs with non percent-encoded
1394         # non-ASCII characters (see telemb.py, ard.py [#3412])
1395         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1396         # To work around aforementioned issue we will replace request's original URL with
1397         # percent-encoded one
1398         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1399         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1400         url = req.get_full_url()
1401         url_escaped = escape_url(url)
1402
1403         # Substitute URL if any change after escaping
1404         if url != url_escaped:
1405             req = update_Request(req, url=url_escaped)
1406
1407         for h, v in self._params.get('http_headers', std_headers).items():
1408             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1409             # The dict keys are capitalized because of this bug by urllib
1410             if h.capitalize() not in req.headers:
1411                 req.add_header(h, v)
1412
1413         if 'Accept-encoding' not in req.headers:
1414             req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1415
1416         req.headers = handle_youtubedl_headers(req.headers)
1417
1418         return super().do_request_(req)
1419
1420     def http_response(self, req, resp):
1421         old_resp = resp
1422         # gzip
1423         if resp.headers.get('Content-encoding', '') == 'gzip':
1424             content = resp.read()
1425             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1426             try:
1427                 uncompressed = io.BytesIO(gz.read())
1428             except OSError as original_ioerror:
1429                 # There may be junk add the end of the file
1430                 # See http://stackoverflow.com/q/4928560/35070 for details
1431                 for i in range(1, 1024):
1432                     try:
1433                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1434                         uncompressed = io.BytesIO(gz.read())
1435                     except OSError:
1436                         continue
1437                     break
1438                 else:
1439                     raise original_ioerror
1440             resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1441             resp.msg = old_resp.msg
1442             del resp.headers['Content-encoding']
1443         # deflate
1444         if resp.headers.get('Content-encoding', '') == 'deflate':
1445             gz = io.BytesIO(self.deflate(resp.read()))
1446             resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1447             resp.msg = old_resp.msg
1448             del resp.headers['Content-encoding']
1449         # brotli
1450         if resp.headers.get('Content-encoding', '') == 'br':
1451             resp = urllib.request.addinfourl(
1452                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1453             resp.msg = old_resp.msg
1454             del resp.headers['Content-encoding']
1455         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1456         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1457         if 300 <= resp.code < 400:
1458             location = resp.headers.get('Location')
1459             if location:
1460                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1461                 location = location.encode('iso-8859-1').decode()
1462                 location_escaped = escape_url(location)
1463                 if location != location_escaped:
1464                     del resp.headers['Location']
1465                     resp.headers['Location'] = location_escaped
1466         return resp
1467
1468     https_request = http_request
1469     https_response = http_response
1470
1471
1472 def make_socks_conn_class(base_class, socks_proxy):
1473     assert issubclass(base_class, (
1474         http.client.HTTPConnection, http.client.HTTPSConnection))
1475
1476     url_components = urllib.parse.urlparse(socks_proxy)
1477     if url_components.scheme.lower() == 'socks5':
1478         socks_type = ProxyType.SOCKS5
1479     elif url_components.scheme.lower() in ('socks', 'socks4'):
1480         socks_type = ProxyType.SOCKS4
1481     elif url_components.scheme.lower() == 'socks4a':
1482         socks_type = ProxyType.SOCKS4A
1483
1484     def unquote_if_non_empty(s):
1485         if not s:
1486             return s
1487         return urllib.parse.unquote_plus(s)
1488
1489     proxy_args = (
1490         socks_type,
1491         url_components.hostname, url_components.port or 1080,
1492         True,  # Remote DNS
1493         unquote_if_non_empty(url_components.username),
1494         unquote_if_non_empty(url_components.password),
1495     )
1496
1497     class SocksConnection(base_class):
1498         def connect(self):
1499             self.sock = sockssocket()
1500             self.sock.setproxy(*proxy_args)
1501             if isinstance(self.timeout, (int, float)):
1502                 self.sock.settimeout(self.timeout)
1503             self.sock.connect((self.host, self.port))
1504
1505             if isinstance(self, http.client.HTTPSConnection):
1506                 if hasattr(self, '_context'):  # Python > 2.6
1507                     self.sock = self._context.wrap_socket(
1508                         self.sock, server_hostname=self.host)
1509                 else:
1510                     self.sock = ssl.wrap_socket(self.sock)
1511
1512     return SocksConnection
1513
1514
1515 class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
1516     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1517         urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1518         self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1519         self._params = params
1520
1521     def https_open(self, req):
1522         kwargs = {}
1523         conn_class = self._https_conn_class
1524
1525         if hasattr(self, '_context'):  # python > 2.6
1526             kwargs['context'] = self._context
1527         if hasattr(self, '_check_hostname'):  # python 3.x
1528             kwargs['check_hostname'] = self._check_hostname
1529
1530         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1531         if socks_proxy:
1532             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1533             del req.headers['Ytdl-socks-proxy']
1534
1535         try:
1536             return self.do_open(
1537                 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1538         except urllib.error.URLError as e:
1539             if (isinstance(e.reason, ssl.SSLError)
1540                     and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1541                 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1542             raise
1543
1544
1545 def is_path_like(f):
1546     return isinstance(f, (str, bytes, os.PathLike))
1547
1548
1549 class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
1550     """
1551     See [1] for cookie file format.
1552
1553     1. https://curl.haxx.se/docs/http-cookies.html
1554     """
1555     _HTTPONLY_PREFIX = '#HttpOnly_'
1556     _ENTRY_LEN = 7
1557     _HEADER = '''# Netscape HTTP Cookie File
1558 # This file is generated by yt-dlp.  Do not edit.
1559
1560 '''
1561     _CookieFileEntry = collections.namedtuple(
1562         'CookieFileEntry',
1563         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1564
1565     def __init__(self, filename=None, *args, **kwargs):
1566         super().__init__(None, *args, **kwargs)
1567         if is_path_like(filename):
1568             filename = os.fspath(filename)
1569         self.filename = filename
1570
1571     @staticmethod
1572     def _true_or_false(cndn):
1573         return 'TRUE' if cndn else 'FALSE'
1574
1575     @contextlib.contextmanager
1576     def open(self, file, *, write=False):
1577         if is_path_like(file):
1578             with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1579                 yield f
1580         else:
1581             if write:
1582                 file.truncate(0)
1583             yield file
1584
1585     def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1586         now = time.time()
1587         for cookie in self:
1588             if (not ignore_discard and cookie.discard
1589                     or not ignore_expires and cookie.is_expired(now)):
1590                 continue
1591             name, value = cookie.name, cookie.value
1592             if value is None:
1593                 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1594                 # with no name, whereas http.cookiejar regards it as a
1595                 # cookie with no value.
1596                 name, value = '', name
1597             f.write('%s\n' % '\t'.join((
1598                 cookie.domain,
1599                 self._true_or_false(cookie.domain.startswith('.')),
1600                 cookie.path,
1601                 self._true_or_false(cookie.secure),
1602                 str_or_none(cookie.expires, default=''),
1603                 name, value
1604             )))
1605
1606     def save(self, filename=None, *args, **kwargs):
1607         """
1608         Save cookies to a file.
1609         Code is taken from CPython 3.6
1610         https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1611
1612         if filename is None:
1613             if self.filename is not None:
1614                 filename = self.filename
1615             else:
1616                 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1617
1618         # Store session cookies with `expires` set to 0 instead of an empty string
1619         for cookie in self:
1620             if cookie.expires is None:
1621                 cookie.expires = 0
1622
1623         with self.open(filename, write=True) as f:
1624             f.write(self._HEADER)
1625             self._really_save(f, *args, **kwargs)
1626
1627     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1628         """Load cookies from a file."""
1629         if filename is None:
1630             if self.filename is not None:
1631                 filename = self.filename
1632             else:
1633                 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1634
1635         def prepare_line(line):
1636             if line.startswith(self._HTTPONLY_PREFIX):
1637                 line = line[len(self._HTTPONLY_PREFIX):]
1638             # comments and empty lines are fine
1639             if line.startswith('#') or not line.strip():
1640                 return line
1641             cookie_list = line.split('\t')
1642             if len(cookie_list) != self._ENTRY_LEN:
1643                 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
1644             cookie = self._CookieFileEntry(*cookie_list)
1645             if cookie.expires_at and not cookie.expires_at.isdigit():
1646                 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1647             return line
1648
1649         cf = io.StringIO()
1650         with self.open(filename) as f:
1651             for line in f:
1652                 try:
1653                     cf.write(prepare_line(line))
1654                 except http.cookiejar.LoadError as e:
1655                     if f'{line.strip()} '[0] in '[{"':
1656                         raise http.cookiejar.LoadError(
1657                             'Cookies file must be Netscape formatted, not JSON. See  '
1658                             'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
1659                     write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1660                     continue
1661         cf.seek(0)
1662         self._really_load(cf, filename, ignore_discard, ignore_expires)
1663         # Session cookies are denoted by either `expires` field set to
1664         # an empty string or 0. MozillaCookieJar only recognizes the former
1665         # (see [1]). So we need force the latter to be recognized as session
1666         # cookies on our own.
1667         # Session cookies may be important for cookies-based authentication,
1668         # e.g. usually, when user does not check 'Remember me' check box while
1669         # logging in on a site, some important cookies are stored as session
1670         # cookies so that not recognizing them will result in failed login.
1671         # 1. https://bugs.python.org/issue17164
1672         for cookie in self:
1673             # Treat `expires=0` cookies as session cookies
1674             if cookie.expires == 0:
1675                 cookie.expires = None
1676                 cookie.discard = True
1677
1678
1679 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1680     def __init__(self, cookiejar=None):
1681         urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1682
1683     def http_response(self, request, response):
1684         return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1685
1686     https_request = urllib.request.HTTPCookieProcessor.http_request
1687     https_response = http_response
1688
1689
1690 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1691     """YoutubeDL redirect handler
1692
1693     The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695     This redirect handler solves two issues:
1696      - ensures redirect URL is always unicode under python 2
1697      - introduces support for experimental HTTP response status code
1698        308 Permanent Redirect [2] used by some sites [3]
1699
1700     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703     """
1704
1705     http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1706
1707     def redirect_request(self, req, fp, code, msg, headers, newurl):
1708         """Return a Request or None in response to a redirect.
1709
1710         This is called by the http_error_30x methods when a
1711         redirection response is received.  If a redirection should
1712         take place, return a new Request to allow http_error_30x to
1713         perform the redirect.  Otherwise, raise HTTPError if no-one
1714         else should try to handle this url.  Return None if you can't
1715         but another Handler might.
1716         """
1717         m = req.get_method()
1718         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719                  or code in (301, 302, 303) and m == "POST")):
1720             raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1721         # Strictly (according to RFC 2616), 301 or 302 in response to
1722         # a POST MUST NOT cause a redirection without confirmation
1723         # from the user (of urllib.request, in this case).  In practice,
1724         # essentially all clients do redirect in this case, so we do
1725         # the same.
1726
1727         # Be conciliant with URIs containing a space.  This is mainly
1728         # redundant with the more complete encoding done in http_error_302(),
1729         # but it is kept for compatibility with other callers.
1730         newurl = newurl.replace(' ', '%20')
1731
1732         CONTENT_HEADERS = ("content-length", "content-type")
1733         # NB: don't use dict comprehension for python 2.6 compatibility
1734         newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1735
1736         # A 303 must either use GET or HEAD for subsequent request
1737         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738         if code == 303 and m != 'HEAD':
1739             m = 'GET'
1740         # 301 and 302 redirects are commonly turned into a GET from a POST
1741         # for subsequent requests by browsers, so we'll do the same.
1742         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744         if code in (301, 302) and m == 'POST':
1745             m = 'GET'
1746
1747         return urllib.request.Request(
1748             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1749             unverifiable=True, method=m)
1750
1751
1752 def extract_timezone(date_str):
1753     m = re.search(
1754         r'''(?x)
1755             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1756             (?P<tz>Z|                                            # just the UTC Z, or
1757                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1758                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759                    [ ]?                                          # optional space
1760                 (?P<sign>\+|-)                                   # +/-
1761                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1762             $)
1763         ''', date_str)
1764     if not m:
1765         m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766         timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767         if timezone is not None:
1768             date_str = date_str[:-len(m.group('tz'))]
1769         timezone = datetime.timedelta(hours=timezone or 0)
1770     else:
1771         date_str = date_str[:-len(m.group('tz'))]
1772         if not m.group('sign'):
1773             timezone = datetime.timedelta()
1774         else:
1775             sign = 1 if m.group('sign') == '+' else -1
1776             timezone = datetime.timedelta(
1777                 hours=sign * int(m.group('hours')),
1778                 minutes=sign * int(m.group('minutes')))
1779     return timezone, date_str
1780
1781
1782 def parse_iso8601(date_str, delimiter='T', timezone=None):
1783     """ Return a UNIX timestamp from the given date """
1784
1785     if date_str is None:
1786         return None
1787
1788     date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
1790     if timezone is None:
1791         timezone, date_str = extract_timezone(date_str)
1792
1793     with contextlib.suppress(ValueError):
1794         date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1795         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796         return calendar.timegm(dt.timetuple())
1797
1798
1799 def date_formats(day_first=True):
1800     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
1803 def unified_strdate(date_str, day_first=True):
1804     """Return a string with the date in the format YYYYMMDD"""
1805
1806     if date_str is None:
1807         return None
1808     upload_date = None
1809     # Replace commas
1810     date_str = date_str.replace(',', ' ')
1811     # Remove AM/PM + timezone
1812     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1813     _, date_str = extract_timezone(date_str)
1814
1815     for expression in date_formats(day_first):
1816         with contextlib.suppress(ValueError):
1817             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1818     if upload_date is None:
1819         timetuple = email.utils.parsedate_tz(date_str)
1820         if timetuple:
1821             with contextlib.suppress(ValueError):
1822                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1823     if upload_date is not None:
1824         return str(upload_date)
1825
1826
1827 def unified_timestamp(date_str, day_first=True):
1828     if date_str is None:
1829         return None
1830
1831     date_str = re.sub(r'\s+', ' ', re.sub(
1832         r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1833
1834     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1835     timezone, date_str = extract_timezone(date_str)
1836
1837     # Remove AM/PM + timezone
1838     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
1840     # Remove unrecognized timezones from ISO 8601 alike timestamps
1841     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842     if m:
1843         date_str = date_str[:-len(m.group('tz'))]
1844
1845     # Python only supports microseconds, so remove nanoseconds
1846     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847     if m:
1848         date_str = m.group(1)
1849
1850     for expression in date_formats(day_first):
1851         with contextlib.suppress(ValueError):
1852             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1853             return calendar.timegm(dt.timetuple())
1854
1855     timetuple = email.utils.parsedate_tz(date_str)
1856     if timetuple:
1857         return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1858
1859
1860 def determine_ext(url, default_ext='unknown_video'):
1861     if url is None or '.' not in url:
1862         return default_ext
1863     guess = url.partition('?')[0].rpartition('.')[2]
1864     if re.match(r'^[A-Za-z0-9]+$', guess):
1865         return guess
1866     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1867     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1868         return guess.rstrip('/')
1869     else:
1870         return default_ext
1871
1872
1873 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1875
1876
1877 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1878     R"""
1879     Return a datetime object from a string.
1880     Supported format:
1881         (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883     @param format       strftime format of DATE
1884     @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
1885                         auto: round to the unit provided in date_str (if applicable).
1886     """
1887     auto_precision = False
1888     if precision == 'auto':
1889         auto_precision = True
1890         precision = 'microsecond'
1891     today = datetime_round(datetime.datetime.utcnow(), precision)
1892     if date_str in ('now', 'today'):
1893         return today
1894     if date_str == 'yesterday':
1895         return today - datetime.timedelta(days=1)
1896     match = re.match(
1897         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1898         date_str)
1899     if match is not None:
1900         start_time = datetime_from_str(match.group('start'), precision, format)
1901         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1902         unit = match.group('unit')
1903         if unit == 'month' or unit == 'year':
1904             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1905             unit = 'day'
1906         else:
1907             if unit == 'week':
1908                 unit = 'day'
1909                 time *= 7
1910             delta = datetime.timedelta(**{unit + 's': time})
1911             new_date = start_time + delta
1912         if auto_precision:
1913             return datetime_round(new_date, unit)
1914         return new_date
1915
1916     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1917
1918
1919 def date_from_str(date_str, format='%Y%m%d', strict=False):
1920     R"""
1921     Return a date object from a string using datetime_from_str
1922
1923     @param strict  Restrict allowed patterns to "YYYYMMDD" and
1924                    (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1925     """
1926     if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927         raise ValueError(f'Invalid date format "{date_str}"')
1928     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1929
1930
1931 def datetime_add_months(dt, months):
1932     """Increment/Decrement a datetime object by months."""
1933     month = dt.month + months - 1
1934     year = dt.year + month // 12
1935     month = month % 12 + 1
1936     day = min(dt.day, calendar.monthrange(year, month)[1])
1937     return dt.replace(year, month, day)
1938
1939
1940 def datetime_round(dt, precision='day'):
1941     """
1942     Round a datetime object's time to a specific precision
1943     """
1944     if precision == 'microsecond':
1945         return dt
1946
1947     unit_seconds = {
1948         'day': 86400,
1949         'hour': 3600,
1950         'minute': 60,
1951         'second': 1,
1952     }
1953     roundto = lambda x, n: ((x + n / 2) // n) * n
1954     timestamp = calendar.timegm(dt.timetuple())
1955     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1956
1957
1958 def hyphenate_date(date_str):
1959     """
1960     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962     if match is not None:
1963         return '-'.join(match.groups())
1964     else:
1965         return date_str
1966
1967
1968 class DateRange:
1969     """Represents a time interval between two dates"""
1970
1971     def __init__(self, start=None, end=None):
1972         """start and end must be strings in the format accepted by date"""
1973         if start is not None:
1974             self.start = date_from_str(start, strict=True)
1975         else:
1976             self.start = datetime.datetime.min.date()
1977         if end is not None:
1978             self.end = date_from_str(end, strict=True)
1979         else:
1980             self.end = datetime.datetime.max.date()
1981         if self.start > self.end:
1982             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1983
1984     @classmethod
1985     def day(cls, day):
1986         """Returns a range that only contains the given day"""
1987         return cls(day, day)
1988
1989     def __contains__(self, date):
1990         """Check if the date is in the range"""
1991         if not isinstance(date, datetime.date):
1992             date = date_from_str(date)
1993         return self.start <= date <= self.end
1994
1995     def __str__(self):
1996         return f'{self.start.isoformat()} - {self.end.isoformat()}'
1997
1998     def __eq__(self, other):
1999         return (isinstance(other, DateRange)
2000                 and self.start == other.start and self.end == other.end)
2001
2002
2003 def platform_name():
2004     """ Returns the platform name as a str """
2005     deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2006     return platform.platform()
2007
2008
2009 @functools.cache
2010 def system_identifier():
2011     python_implementation = platform.python_implementation()
2012     if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013         python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2014     libc_ver = []
2015     with contextlib.suppress(OSError):  # We may not have access to the executable
2016         libc_ver = platform.libc_ver()
2017
2018     return 'Python %s (%s %s %s) - %s (%s%s)' % (
2019         platform.python_version(),
2020         python_implementation,
2021         platform.machine(),
2022         platform.architecture()[0],
2023         platform.platform(),
2024         ssl.OPENSSL_VERSION,
2025         format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2026     )
2027
2028
2029 @functools.cache
2030 def get_windows_version():
2031     ''' Get Windows version. returns () if it's not running on Windows '''
2032     if compat_os_name == 'nt':
2033         return version_tuple(platform.win32_ver()[1])
2034     else:
2035         return ()
2036
2037
2038 def write_string(s, out=None, encoding=None):
2039     assert isinstance(s, str)
2040     out = out or sys.stderr
2041
2042     if compat_os_name == 'nt' and supports_terminal_sequences(out):
2043         s = re.sub(r'([\r\n]+)', r' \1', s)
2044
2045     enc, buffer = None, out
2046     if 'b' in getattr(out, 'mode', ''):
2047         enc = encoding or preferredencoding()
2048     elif hasattr(out, 'buffer'):
2049         buffer = out.buffer
2050         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2051
2052     buffer.write(s.encode(enc, 'ignore') if enc else s)
2053     out.flush()
2054
2055
2056 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057     from . import _IN_CLI
2058     if _IN_CLI:
2059         if msg in deprecation_warning._cache:
2060             return
2061         deprecation_warning._cache.add(msg)
2062         if printer:
2063             return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064         return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065     else:
2066         import warnings
2067         warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070 deprecation_warning._cache = set()
2071
2072
2073 def bytes_to_intlist(bs):
2074     if not bs:
2075         return []
2076     if isinstance(bs[0], int):  # Python 3
2077         return list(bs)
2078     else:
2079         return [ord(c) for c in bs]
2080
2081
2082 def intlist_to_bytes(xs):
2083     if not xs:
2084         return b''
2085     return struct.pack('%dB' % len(xs), *xs)
2086
2087
2088 class LockingUnsupportedError(OSError):
2089     msg = 'File locking is not supported'
2090
2091     def __init__(self):
2092         super().__init__(self.msg)
2093
2094
2095 # Cross-platform file locking
2096 if sys.platform == 'win32':
2097     import ctypes
2098     import ctypes.wintypes
2099     import msvcrt
2100
2101     class OVERLAPPED(ctypes.Structure):
2102         _fields_ = [
2103             ('Internal', ctypes.wintypes.LPVOID),
2104             ('InternalHigh', ctypes.wintypes.LPVOID),
2105             ('Offset', ctypes.wintypes.DWORD),
2106             ('OffsetHigh', ctypes.wintypes.DWORD),
2107             ('hEvent', ctypes.wintypes.HANDLE),
2108         ]
2109
2110     kernel32 = ctypes.windll.kernel32
2111     LockFileEx = kernel32.LockFileEx
2112     LockFileEx.argtypes = [
2113         ctypes.wintypes.HANDLE,     # hFile
2114         ctypes.wintypes.DWORD,      # dwFlags
2115         ctypes.wintypes.DWORD,      # dwReserved
2116         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2117         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2118         ctypes.POINTER(OVERLAPPED)  # Overlapped
2119     ]
2120     LockFileEx.restype = ctypes.wintypes.BOOL
2121     UnlockFileEx = kernel32.UnlockFileEx
2122     UnlockFileEx.argtypes = [
2123         ctypes.wintypes.HANDLE,     # hFile
2124         ctypes.wintypes.DWORD,      # dwReserved
2125         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2126         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2127         ctypes.POINTER(OVERLAPPED)  # Overlapped
2128     ]
2129     UnlockFileEx.restype = ctypes.wintypes.BOOL
2130     whole_low = 0xffffffff
2131     whole_high = 0x7fffffff
2132
2133     def _lock_file(f, exclusive, block):
2134         overlapped = OVERLAPPED()
2135         overlapped.Offset = 0
2136         overlapped.OffsetHigh = 0
2137         overlapped.hEvent = 0
2138         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2139
2140         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142                           0, whole_low, whole_high, f._lock_file_overlapped_p):
2143             # NB: No argument form of "ctypes.FormatError" does not work on PyPy
2144             raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2145
2146     def _unlock_file(f):
2147         assert f._lock_file_overlapped_p
2148         handle = msvcrt.get_osfhandle(f.fileno())
2149         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2150             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152 else:
2153     try:
2154         import fcntl
2155
2156         def _lock_file(f, exclusive, block):
2157             flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158             if not block:
2159                 flags |= fcntl.LOCK_NB
2160             try:
2161                 fcntl.flock(f, flags)
2162             except BlockingIOError:
2163                 raise
2164             except OSError:  # AOSP does not have flock()
2165                 fcntl.lockf(f, flags)
2166
2167         def _unlock_file(f):
2168             try:
2169                 fcntl.flock(f, fcntl.LOCK_UN)
2170             except OSError:
2171                 fcntl.lockf(f, fcntl.LOCK_UN)
2172
2173     except ImportError:
2174
2175         def _lock_file(f, exclusive, block):
2176             raise LockingUnsupportedError()
2177
2178         def _unlock_file(f):
2179             raise LockingUnsupportedError()
2180
2181
2182 class locked_file:
2183     locked = False
2184
2185     def __init__(self, filename, mode, block=True, encoding=None):
2186         if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187             raise NotImplementedError(mode)
2188         self.mode, self.block = mode, block
2189
2190         writable = any(f in mode for f in 'wax+')
2191         readable = any(f in mode for f in 'r+')
2192         flags = functools.reduce(operator.ior, (
2193             getattr(os, 'O_CLOEXEC', 0),  # UNIX only
2194             getattr(os, 'O_BINARY', 0),  # Windows only
2195             getattr(os, 'O_NOINHERIT', 0),  # Windows only
2196             os.O_CREAT if writable else 0,  # O_TRUNC only after locking
2197             os.O_APPEND if 'a' in mode else 0,
2198             os.O_EXCL if 'x' in mode else 0,
2199             os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200         ))
2201
2202         self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2203
2204     def __enter__(self):
2205         exclusive = 'r' not in self.mode
2206         try:
2207             _lock_file(self.f, exclusive, self.block)
2208             self.locked = True
2209         except OSError:
2210             self.f.close()
2211             raise
2212         if 'w' in self.mode:
2213             try:
2214                 self.f.truncate()
2215             except OSError as e:
2216                 if e.errno not in (
2217                     errno.ESPIPE,  # Illegal seek - expected for FIFO
2218                     errno.EINVAL,  # Invalid argument - expected for /dev/null
2219                 ):
2220                     raise
2221         return self
2222
2223     def unlock(self):
2224         if not self.locked:
2225             return
2226         try:
2227             _unlock_file(self.f)
2228         finally:
2229             self.locked = False
2230
2231     def __exit__(self, *_):
2232         try:
2233             self.unlock()
2234         finally:
2235             self.f.close()
2236
2237     open = __enter__
2238     close = __exit__
2239
2240     def __getattr__(self, attr):
2241         return getattr(self.f, attr)
2242
2243     def __iter__(self):
2244         return iter(self.f)
2245
2246
2247 @functools.cache
2248 def get_filesystem_encoding():
2249     encoding = sys.getfilesystemencoding()
2250     return encoding if encoding is not None else 'utf-8'
2251
2252
2253 def shell_quote(args):
2254     quoted_args = []
2255     encoding = get_filesystem_encoding()
2256     for a in args:
2257         if isinstance(a, bytes):
2258             # We may get a filename encoded with 'encodeFilename'
2259             a = a.decode(encoding)
2260         quoted_args.append(compat_shlex_quote(a))
2261     return ' '.join(quoted_args)
2262
2263
2264 def smuggle_url(url, data):
2265     """ Pass additional data in a URL for internal use. """
2266
2267     url, idata = unsmuggle_url(url, {})
2268     data.update(idata)
2269     sdata = urllib.parse.urlencode(
2270         {'__youtubedl_smuggle': json.dumps(data)})
2271     return url + '#' + sdata
2272
2273
2274 def unsmuggle_url(smug_url, default=None):
2275     if '#__youtubedl_smuggle' not in smug_url:
2276         return smug_url, default
2277     url, _, sdata = smug_url.rpartition('#')
2278     jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2279     data = json.loads(jsond)
2280     return url, data
2281
2282
2283 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284     """ Formats numbers with decimal sufixes like K, M, etc """
2285     num, factor = float_or_none(num), float(factor)
2286     if num is None or num < 0:
2287         return None
2288     POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289     exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290     suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2291     if factor == 1024:
2292         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2293     converted = num / (factor ** exponent)
2294     return fmt % (converted, suffix)
2295
2296
2297 def format_bytes(bytes):
2298     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2299
2300
2301 def lookup_unit_table(unit_table, s, strict=False):
2302     num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2303     units_re = '|'.join(re.escape(u) for u in unit_table)
2304     m = (re.fullmatch if strict else re.match)(
2305         rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2306     if not m:
2307         return None
2308
2309     num = float(m.group('num').replace(',', '.'))
2310     mult = unit_table[m.group('unit')]
2311     return round(num * mult)
2312
2313
2314 def parse_bytes(s):
2315     """Parse a string indicating a byte quantity into an integer"""
2316     return lookup_unit_table(
2317         {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318         s.upper(), strict=True)
2319
2320
2321 def parse_filesize(s):
2322     if s is None:
2323         return None
2324
2325     # The lower-case forms are of course incorrect and unofficial,
2326     # but we support those too
2327     _UNIT_TABLE = {
2328         'B': 1,
2329         'b': 1,
2330         'bytes': 1,
2331         'KiB': 1024,
2332         'KB': 1000,
2333         'kB': 1024,
2334         'Kb': 1000,
2335         'kb': 1000,
2336         'kilobytes': 1000,
2337         'kibibytes': 1024,
2338         'MiB': 1024 ** 2,
2339         'MB': 1000 ** 2,
2340         'mB': 1024 ** 2,
2341         'Mb': 1000 ** 2,
2342         'mb': 1000 ** 2,
2343         'megabytes': 1000 ** 2,
2344         'mebibytes': 1024 ** 2,
2345         'GiB': 1024 ** 3,
2346         'GB': 1000 ** 3,
2347         'gB': 1024 ** 3,
2348         'Gb': 1000 ** 3,
2349         'gb': 1000 ** 3,
2350         'gigabytes': 1000 ** 3,
2351         'gibibytes': 1024 ** 3,
2352         'TiB': 1024 ** 4,
2353         'TB': 1000 ** 4,
2354         'tB': 1024 ** 4,
2355         'Tb': 1000 ** 4,
2356         'tb': 1000 ** 4,
2357         'terabytes': 1000 ** 4,
2358         'tebibytes': 1024 ** 4,
2359         'PiB': 1024 ** 5,
2360         'PB': 1000 ** 5,
2361         'pB': 1024 ** 5,
2362         'Pb': 1000 ** 5,
2363         'pb': 1000 ** 5,
2364         'petabytes': 1000 ** 5,
2365         'pebibytes': 1024 ** 5,
2366         'EiB': 1024 ** 6,
2367         'EB': 1000 ** 6,
2368         'eB': 1024 ** 6,
2369         'Eb': 1000 ** 6,
2370         'eb': 1000 ** 6,
2371         'exabytes': 1000 ** 6,
2372         'exbibytes': 1024 ** 6,
2373         'ZiB': 1024 ** 7,
2374         'ZB': 1000 ** 7,
2375         'zB': 1024 ** 7,
2376         'Zb': 1000 ** 7,
2377         'zb': 1000 ** 7,
2378         'zettabytes': 1000 ** 7,
2379         'zebibytes': 1024 ** 7,
2380         'YiB': 1024 ** 8,
2381         'YB': 1000 ** 8,
2382         'yB': 1024 ** 8,
2383         'Yb': 1000 ** 8,
2384         'yb': 1000 ** 8,
2385         'yottabytes': 1000 ** 8,
2386         'yobibytes': 1024 ** 8,
2387     }
2388
2389     return lookup_unit_table(_UNIT_TABLE, s)
2390
2391
2392 def parse_count(s):
2393     if s is None:
2394         return None
2395
2396     s = re.sub(r'^[^\d]+\s', '', s).strip()
2397
2398     if re.match(r'^[\d,.]+$', s):
2399         return str_to_int(s)
2400
2401     _UNIT_TABLE = {
2402         'k': 1000,
2403         'K': 1000,
2404         'm': 1000 ** 2,
2405         'M': 1000 ** 2,
2406         'kk': 1000 ** 2,
2407         'KK': 1000 ** 2,
2408         'b': 1000 ** 3,
2409         'B': 1000 ** 3,
2410     }
2411
2412     ret = lookup_unit_table(_UNIT_TABLE, s)
2413     if ret is not None:
2414         return ret
2415
2416     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417     if mobj:
2418         return str_to_int(mobj.group(1))
2419
2420
2421 def parse_resolution(s, *, lenient=False):
2422     if s is None:
2423         return {}
2424
2425     if lenient:
2426         mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427     else:
2428         mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2429     if mobj:
2430         return {
2431             'width': int(mobj.group('w')),
2432             'height': int(mobj.group('h')),
2433         }
2434
2435     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2436     if mobj:
2437         return {'height': int(mobj.group(1))}
2438
2439     mobj = re.search(r'\b([48])[kK]\b', s)
2440     if mobj:
2441         return {'height': int(mobj.group(1)) * 540}
2442
2443     return {}
2444
2445
2446 def parse_bitrate(s):
2447     if not isinstance(s, str):
2448         return
2449     mobj = re.search(r'\b(\d+)\s*kbps', s)
2450     if mobj:
2451         return int(mobj.group(1))
2452
2453
2454 def month_by_name(name, lang='en'):
2455     """ Return the number of a month by (locale-independently) English name """
2456
2457     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2458
2459     try:
2460         return month_names.index(name) + 1
2461     except ValueError:
2462         return None
2463
2464
2465 def month_by_abbreviation(abbrev):
2466     """ Return the number of a month by (locale-independently) English
2467         abbreviations """
2468
2469     try:
2470         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2471     except ValueError:
2472         return None
2473
2474
2475 def fix_xml_ampersands(xml_str):
2476     """Replace all the '&' by '&amp;' in XML"""
2477     return re.sub(
2478         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2479         '&amp;',
2480         xml_str)
2481
2482
2483 def setproctitle(title):
2484     assert isinstance(title, str)
2485
2486     # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487     try:
2488         import ctypes
2489     except ImportError:
2490         return
2491
2492     try:
2493         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2494     except OSError:
2495         return
2496     except TypeError:
2497         # LoadLibrary in Windows Python 2.7.13 only expects
2498         # a bytestring, but since unicode_literals turns
2499         # every string into a unicode string, it fails.
2500         return
2501     title_bytes = title.encode()
2502     buf = ctypes.create_string_buffer(len(title_bytes))
2503     buf.value = title_bytes
2504     try:
2505         libc.prctl(15, buf, 0, 0, 0)
2506     except AttributeError:
2507         return  # Strange libc, just skip this
2508
2509
2510 def remove_start(s, start):
2511     return s[len(start):] if s is not None and s.startswith(start) else s
2512
2513
2514 def remove_end(s, end):
2515     return s[:-len(end)] if s is not None and s.endswith(end) else s
2516
2517
2518 def remove_quotes(s):
2519     if s is None or len(s) < 2:
2520         return s
2521     for quote in ('"', "'", ):
2522         if s[0] == quote and s[-1] == quote:
2523             return s[1:-1]
2524     return s
2525
2526
2527 def get_domain(url):
2528     """
2529     This implementation is inconsistent, but is kept for compatibility.
2530     Use this only for "webpage_url_domain"
2531     """
2532     return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2533
2534
2535 def url_basename(url):
2536     path = urllib.parse.urlparse(url).path
2537     return path.strip('/').split('/')[-1]
2538
2539
2540 def base_url(url):
2541     return re.match(r'https?://[^?#]+/', url).group()
2542
2543
2544 def urljoin(base, path):
2545     if isinstance(path, bytes):
2546         path = path.decode()
2547     if not isinstance(path, str) or not path:
2548         return None
2549     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2550         return path
2551     if isinstance(base, bytes):
2552         base = base.decode()
2553     if not isinstance(base, str) or not re.match(
2554             r'^(?:https?:)?//', base):
2555         return None
2556     return urllib.parse.urljoin(base, path)
2557
2558
2559 class HEADRequest(urllib.request.Request):
2560     def get_method(self):
2561         return 'HEAD'
2562
2563
2564 class PUTRequest(urllib.request.Request):
2565     def get_method(self):
2566         return 'PUT'
2567
2568
2569 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2570     if get_attr and v is not None:
2571         v = getattr(v, get_attr, None)
2572     try:
2573         return int(v) * invscale // scale
2574     except (ValueError, TypeError, OverflowError):
2575         return default
2576
2577
2578 def str_or_none(v, default=None):
2579     return default if v is None else str(v)
2580
2581
2582 def str_to_int(int_str):
2583     """ A more relaxed version of int_or_none """
2584     if isinstance(int_str, int):
2585         return int_str
2586     elif isinstance(int_str, str):
2587         int_str = re.sub(r'[,\.\+]', '', int_str)
2588         return int_or_none(int_str)
2589
2590
2591 def float_or_none(v, scale=1, invscale=1, default=None):
2592     if v is None:
2593         return default
2594     try:
2595         return float(v) * invscale / scale
2596     except (ValueError, TypeError):
2597         return default
2598
2599
2600 def bool_or_none(v, default=None):
2601     return v if isinstance(v, bool) else default
2602
2603
2604 def strip_or_none(v, default=None):
2605     return v.strip() if isinstance(v, str) else default
2606
2607
2608 def url_or_none(url):
2609     if not url or not isinstance(url, str):
2610         return None
2611     url = url.strip()
2612     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2613
2614
2615 def request_to_url(req):
2616     if isinstance(req, urllib.request.Request):
2617         return req.get_full_url()
2618     else:
2619         return req
2620
2621
2622 def strftime_or_none(timestamp, date_format, default=None):
2623     datetime_object = None
2624     try:
2625         if isinstance(timestamp, (int, float)):  # unix timestamp
2626             # Using naive datetime here can break timestamp() in Windows
2627             # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628             datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2629         elif isinstance(timestamp, str):  # assume YYYYMMDD
2630             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2631         date_format = re.sub(  # Support %s on windows
2632             r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2633         return datetime_object.strftime(date_format)
2634     except (ValueError, TypeError, AttributeError):
2635         return default
2636
2637
2638 def parse_duration(s):
2639     if not isinstance(s, str):
2640         return None
2641     s = s.strip()
2642     if not s:
2643         return None
2644
2645     days, hours, mins, secs, ms = [None] * 5
2646     m = re.match(r'''(?x)
2647             (?P<before_secs>
2648                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650             (?P<ms>[.:][0-9]+)?Z?$
2651         ''', s)
2652     if m:
2653         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2654     else:
2655         m = re.match(
2656             r'''(?ix)(?:P?
2657                 (?:
2658                     [0-9]+\s*y(?:ears?)?,?\s*
2659                 )?
2660                 (?:
2661                     [0-9]+\s*m(?:onths?)?,?\s*
2662                 )?
2663                 (?:
2664                     [0-9]+\s*w(?:eeks?)?,?\s*
2665                 )?
2666                 (?:
2667                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2668                 )?
2669                 T)?
2670                 (?:
2671                     (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2672                 )?
2673                 (?:
2674                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2675                 )?
2676                 (?:
2677                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2678                 )?Z?$''', s)
2679         if m:
2680             days, hours, mins, secs, ms = m.groups()
2681         else:
2682             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2683             if m:
2684                 hours, mins = m.groups()
2685             else:
2686                 return None
2687
2688     if ms:
2689         ms = ms.replace(':', '.')
2690     return sum(float(part or 0) * mult for part, mult in (
2691         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2692
2693
2694 def prepend_extension(filename, ext, expected_real_ext=None):
2695     name, real_ext = os.path.splitext(filename)
2696     return (
2697         f'{name}.{ext}{real_ext}'
2698         if not expected_real_ext or real_ext[1:] == expected_real_ext
2699         else f'{filename}.{ext}')
2700
2701
2702 def replace_extension(filename, ext, expected_real_ext=None):
2703     name, real_ext = os.path.splitext(filename)
2704     return '{}.{}'.format(
2705         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706         ext)
2707
2708
2709 def check_executable(exe, args=[]):
2710     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711     args can be a list of arguments for a short output (like -version) """
2712     try:
2713         Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2714     except OSError:
2715         return False
2716     return exe
2717
2718
2719 def _get_exe_version_output(exe, args):
2720     try:
2721         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2722         # SIGTTOU if yt-dlp is run in the background.
2723         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2724         stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725                                  stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2726     except OSError:
2727         return False
2728     return stdout
2729
2730
2731 def detect_exe_version(output, version_re=None, unrecognized='present'):
2732     assert isinstance(output, str)
2733     if version_re is None:
2734         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735     m = re.search(version_re, output)
2736     if m:
2737         return m.group(1)
2738     else:
2739         return unrecognized
2740
2741
2742 def get_exe_version(exe, args=['--version'],
2743                     version_re=None, unrecognized='present'):
2744     """ Returns the version of the specified executable,
2745     or False if the executable is not present """
2746     out = _get_exe_version_output(exe, args)
2747     return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
2750 def frange(start=0, stop=None, step=1):
2751     """Float range"""
2752     if stop is None:
2753         start, stop = 0, start
2754     sign = [-1, 1][step > 0] if step else 0
2755     while sign * start < sign * stop:
2756         yield start
2757         start += step
2758
2759
2760 class LazyList(collections.abc.Sequence):
2761     """Lazy immutable list from an iterable
2762     Note that slices of a LazyList are lists and not LazyList"""
2763
2764     class IndexError(IndexError):
2765         pass
2766
2767     def __init__(self, iterable, *, reverse=False, _cache=None):
2768         self._iterable = iter(iterable)
2769         self._cache = [] if _cache is None else _cache
2770         self._reversed = reverse
2771
2772     def __iter__(self):
2773         if self._reversed:
2774             # We need to consume the entire iterable to iterate in reverse
2775             yield from self.exhaust()
2776             return
2777         yield from self._cache
2778         for item in self._iterable:
2779             self._cache.append(item)
2780             yield item
2781
2782     def _exhaust(self):
2783         self._cache.extend(self._iterable)
2784         self._iterable = []  # Discard the emptied iterable to make it pickle-able
2785         return self._cache
2786
2787     def exhaust(self):
2788         """Evaluate the entire iterable"""
2789         return self._exhaust()[::-1 if self._reversed else 1]
2790
2791     @staticmethod
2792     def _reverse_index(x):
2793         return None if x is None else ~x
2794
2795     def __getitem__(self, idx):
2796         if isinstance(idx, slice):
2797             if self._reversed:
2798                 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2799             start, stop, step = idx.start, idx.stop, idx.step or 1
2800         elif isinstance(idx, int):
2801             if self._reversed:
2802                 idx = self._reverse_index(idx)
2803             start, stop, step = idx, idx, 0
2804         else:
2805             raise TypeError('indices must be integers or slices')
2806         if ((start or 0) < 0 or (stop or 0) < 0
2807                 or (start is None and step < 0)
2808                 or (stop is None and step > 0)):
2809             # We need to consume the entire iterable to be able to slice from the end
2810             # Obviously, never use this with infinite iterables
2811             self._exhaust()
2812             try:
2813                 return self._cache[idx]
2814             except IndexError as e:
2815                 raise self.IndexError(e) from e
2816         n = max(start or 0, stop or 0) - len(self._cache) + 1
2817         if n > 0:
2818             self._cache.extend(itertools.islice(self._iterable, n))
2819         try:
2820             return self._cache[idx]
2821         except IndexError as e:
2822             raise self.IndexError(e) from e
2823
2824     def __bool__(self):
2825         try:
2826             self[-1] if self._reversed else self[0]
2827         except self.IndexError:
2828             return False
2829         return True
2830
2831     def __len__(self):
2832         self._exhaust()
2833         return len(self._cache)
2834
2835     def __reversed__(self):
2836         return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2837
2838     def __copy__(self):
2839         return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2840
2841     def __repr__(self):
2842         # repr and str should mimic a list. So we exhaust the iterable
2843         return repr(self.exhaust())
2844
2845     def __str__(self):
2846         return repr(self.exhaust())
2847
2848
2849 class PagedList:
2850
2851     class IndexError(IndexError):
2852         pass
2853
2854     def __len__(self):
2855         # This is only useful for tests
2856         return len(self.getslice())
2857
2858     def __init__(self, pagefunc, pagesize, use_cache=True):
2859         self._pagefunc = pagefunc
2860         self._pagesize = pagesize
2861         self._pagecount = float('inf')
2862         self._use_cache = use_cache
2863         self._cache = {}
2864
2865     def getpage(self, pagenum):
2866         page_results = self._cache.get(pagenum)
2867         if page_results is None:
2868             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2869         if self._use_cache:
2870             self._cache[pagenum] = page_results
2871         return page_results
2872
2873     def getslice(self, start=0, end=None):
2874         return list(self._getslice(start, end))
2875
2876     def _getslice(self, start, end):
2877         raise NotImplementedError('This method must be implemented by subclasses')
2878
2879     def __getitem__(self, idx):
2880         assert self._use_cache, 'Indexing PagedList requires cache'
2881         if not isinstance(idx, int) or idx < 0:
2882             raise TypeError('indices must be non-negative integers')
2883         entries = self.getslice(idx, idx + 1)
2884         if not entries:
2885             raise self.IndexError()
2886         return entries[0]
2887
2888
2889 class OnDemandPagedList(PagedList):
2890     """Download pages until a page with less than maximum results"""
2891
2892     def _getslice(self, start, end):
2893         for pagenum in itertools.count(start // self._pagesize):
2894             firstid = pagenum * self._pagesize
2895             nextfirstid = pagenum * self._pagesize + self._pagesize
2896             if start >= nextfirstid:
2897                 continue
2898
2899             startv = (
2900                 start % self._pagesize
2901                 if firstid <= start < nextfirstid
2902                 else 0)
2903             endv = (
2904                 ((end - 1) % self._pagesize) + 1
2905                 if (end is not None and firstid <= end <= nextfirstid)
2906                 else None)
2907
2908             try:
2909                 page_results = self.getpage(pagenum)
2910             except Exception:
2911                 self._pagecount = pagenum - 1
2912                 raise
2913             if startv != 0 or endv is not None:
2914                 page_results = page_results[startv:endv]
2915             yield from page_results
2916
2917             # A little optimization - if current page is not "full", ie. does
2918             # not contain page_size videos then we can assume that this page
2919             # is the last one - there are no more ids on further pages -
2920             # i.e. no need to query again.
2921             if len(page_results) + startv < self._pagesize:
2922                 break
2923
2924             # If we got the whole page, but the next page is not interesting,
2925             # break out early as well
2926             if end == nextfirstid:
2927                 break
2928
2929
2930 class InAdvancePagedList(PagedList):
2931     """PagedList with total number of pages known in advance"""
2932
2933     def __init__(self, pagefunc, pagecount, pagesize):
2934         PagedList.__init__(self, pagefunc, pagesize, True)
2935         self._pagecount = pagecount
2936
2937     def _getslice(self, start, end):
2938         start_page = start // self._pagesize
2939         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2940         skip_elems = start - start_page * self._pagesize
2941         only_more = None if end is None else end - start
2942         for pagenum in range(start_page, end_page):
2943             page_results = self.getpage(pagenum)
2944             if skip_elems:
2945                 page_results = page_results[skip_elems:]
2946                 skip_elems = None
2947             if only_more is not None:
2948                 if len(page_results) < only_more:
2949                     only_more -= len(page_results)
2950                 else:
2951                     yield from page_results[:only_more]
2952                     break
2953             yield from page_results
2954
2955
2956 class PlaylistEntries:
2957     MissingEntry = object()
2958     is_exhausted = False
2959
2960     def __init__(self, ydl, info_dict):
2961         self.ydl = ydl
2962
2963         # _entries must be assigned now since infodict can change during iteration
2964         entries = info_dict.get('entries')
2965         if entries is None:
2966             raise EntryNotInPlaylist('There are no entries')
2967         elif isinstance(entries, list):
2968             self.is_exhausted = True
2969
2970         requested_entries = info_dict.get('requested_entries')
2971         self.is_incomplete = requested_entries is not None
2972         if self.is_incomplete:
2973             assert self.is_exhausted
2974             self._entries = [self.MissingEntry] * max(requested_entries or [0])
2975             for i, entry in zip(requested_entries, entries):
2976                 self._entries[i - 1] = entry
2977         elif isinstance(entries, (list, PagedList, LazyList)):
2978             self._entries = entries
2979         else:
2980             self._entries = LazyList(entries)
2981
2982     PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983         (?P<start>[+-]?\d+)?
2984         (?P<range>[:-]
2985             (?P<end>[+-]?\d+|inf(?:inite)?)?
2986             (?::(?P<step>[+-]?\d+))?
2987         )?''')
2988
2989     @classmethod
2990     def parse_playlist_items(cls, string):
2991         for segment in string.split(','):
2992             if not segment:
2993                 raise ValueError('There is two or more consecutive commas')
2994             mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995             if not mobj:
2996                 raise ValueError(f'{segment!r} is not a valid specification')
2997             start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998             if int_or_none(step) == 0:
2999                 raise ValueError(f'Step in {segment!r} cannot be zero')
3000             yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3001
3002     def get_requested_items(self):
3003         playlist_items = self.ydl.params.get('playlist_items')
3004         playlist_start = self.ydl.params.get('playliststart', 1)
3005         playlist_end = self.ydl.params.get('playlistend')
3006         # For backwards compatibility, interpret -1 as whole list
3007         if playlist_end in (-1, None):
3008             playlist_end = ''
3009         if not playlist_items:
3010             playlist_items = f'{playlist_start}:{playlist_end}'
3011         elif playlist_start != 1 or playlist_end:
3012             self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014         for index in self.parse_playlist_items(playlist_items):
3015             for i, entry in self[index]:
3016                 yield i, entry
3017                 if not entry:
3018                     continue
3019                 try:
3020                     # TODO: Add auto-generated fields
3021                     self.ydl._match_entry(entry, incomplete=True, silent=True)
3022                 except (ExistingVideoReached, RejectedVideoReached):
3023                     return
3024
3025     def get_full_count(self):
3026         if self.is_exhausted and not self.is_incomplete:
3027             return len(self)
3028         elif isinstance(self._entries, InAdvancePagedList):
3029             if self._entries._pagesize == 1:
3030                 return self._entries._pagecount
3031
3032     @functools.cached_property
3033     def _getter(self):
3034         if isinstance(self._entries, list):
3035             def get_entry(i):
3036                 try:
3037                     entry = self._entries[i]
3038                 except IndexError:
3039                     entry = self.MissingEntry
3040                     if not self.is_incomplete:
3041                         raise self.IndexError()
3042                 if entry is self.MissingEntry:
3043                     raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3044                 return entry
3045         else:
3046             def get_entry(i):
3047                 try:
3048                     return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049                 except (LazyList.IndexError, PagedList.IndexError):
3050                     raise self.IndexError()
3051         return get_entry
3052
3053     def __getitem__(self, idx):
3054         if isinstance(idx, int):
3055             idx = slice(idx, idx)
3056
3057         # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058         step = 1 if idx.step is None else idx.step
3059         if idx.start is None:
3060             start = 0 if step > 0 else len(self) - 1
3061         else:
3062             start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064         # NB: Do not call len(self) when idx == [:]
3065         if idx.stop is None:
3066             stop = 0 if step < 0 else float('inf')
3067         else:
3068             stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069         stop += [-1, 1][step > 0]
3070
3071         for i in frange(start, stop, step):
3072             if i < 0:
3073                 continue
3074             try:
3075                 entry = self._getter(i)
3076             except self.IndexError:
3077                 self.is_exhausted = True
3078                 if step > 0:
3079                     break
3080                 continue
3081             yield i + 1, entry
3082
3083     def __len__(self):
3084         return len(tuple(self[:]))
3085
3086     class IndexError(IndexError):
3087         pass
3088
3089
3090 def uppercase_escape(s):
3091     unicode_escape = codecs.getdecoder('unicode_escape')
3092     return re.sub(
3093         r'\\U[0-9a-fA-F]{8}',
3094         lambda m: unicode_escape(m.group(0))[0],
3095         s)
3096
3097
3098 def lowercase_escape(s):
3099     unicode_escape = codecs.getdecoder('unicode_escape')
3100     return re.sub(
3101         r'\\u[0-9a-fA-F]{4}',
3102         lambda m: unicode_escape(m.group(0))[0],
3103         s)
3104
3105
3106 def escape_rfc3986(s):
3107     """Escape non-ASCII characters as suggested by RFC 3986"""
3108     return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3109
3110
3111 def escape_url(url):
3112     """Escape URL as suggested by RFC 3986"""
3113     url_parsed = urllib.parse.urlparse(url)
3114     return url_parsed._replace(
3115         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3116         path=escape_rfc3986(url_parsed.path),
3117         params=escape_rfc3986(url_parsed.params),
3118         query=escape_rfc3986(url_parsed.query),
3119         fragment=escape_rfc3986(url_parsed.fragment)
3120     ).geturl()
3121
3122
3123 def parse_qs(url, **kwargs):
3124     return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3125
3126
3127 def read_batch_urls(batch_fd):
3128     def fixup(url):
3129         if not isinstance(url, str):
3130             url = url.decode('utf-8', 'replace')
3131         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132         for bom in BOM_UTF8:
3133             if url.startswith(bom):
3134                 url = url[len(bom):]
3135         url = url.lstrip()
3136         if not url or url.startswith(('#', ';', ']')):
3137             return False
3138         # "#" cannot be stripped out since it is part of the URI
3139         # However, it can be safely stripped out if following a whitespace
3140         return re.split(r'\s#', url, 1)[0].rstrip()
3141
3142     with contextlib.closing(batch_fd) as fd:
3143         return [url for url in map(fixup, fd) if url]
3144
3145
3146 def urlencode_postdata(*args, **kargs):
3147     return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3148
3149
3150 def update_url_query(url, query):
3151     if not query:
3152         return url
3153     parsed_url = urllib.parse.urlparse(url)
3154     qs = urllib.parse.parse_qs(parsed_url.query)
3155     qs.update(query)
3156     return urllib.parse.urlunparse(parsed_url._replace(
3157         query=urllib.parse.urlencode(qs, True)))
3158
3159
3160 def update_Request(req, url=None, data=None, headers=None, query=None):
3161     req_headers = req.headers.copy()
3162     req_headers.update(headers or {})
3163     req_data = data or req.data
3164     req_url = update_url_query(url or req.get_full_url(), query)
3165     req_get_method = req.get_method()
3166     if req_get_method == 'HEAD':
3167         req_type = HEADRequest
3168     elif req_get_method == 'PUT':
3169         req_type = PUTRequest
3170     else:
3171         req_type = urllib.request.Request
3172     new_req = req_type(
3173         req_url, data=req_data, headers=req_headers,
3174         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175     if hasattr(req, 'timeout'):
3176         new_req.timeout = req.timeout
3177     return new_req
3178
3179
3180 def _multipart_encode_impl(data, boundary):
3181     content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183     out = b''
3184     for k, v in data.items():
3185         out += b'--' + boundary.encode('ascii') + b'\r\n'
3186         if isinstance(k, str):
3187             k = k.encode()
3188         if isinstance(v, str):
3189             v = v.encode()
3190         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3192         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3193         if boundary.encode('ascii') in content:
3194             raise ValueError('Boundary overlaps with data')
3195         out += content
3196
3197     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199     return out, content_type
3200
3201
3202 def multipart_encode(data, boundary=None):
3203     '''
3204     Encode a dict to RFC 7578-compliant form-data
3205
3206     data:
3207         A dict where keys and values can be either Unicode or bytes-like
3208         objects.
3209     boundary:
3210         If specified a Unicode object, it's used as the boundary. Otherwise
3211         a random boundary is generated.
3212
3213     Reference: https://tools.ietf.org/html/rfc7578
3214     '''
3215     has_specified_boundary = boundary is not None
3216
3217     while True:
3218         if boundary is None:
3219             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221         try:
3222             out, content_type = _multipart_encode_impl(data, boundary)
3223             break
3224         except ValueError:
3225             if has_specified_boundary:
3226                 raise
3227             boundary = None
3228
3229     return out, content_type
3230
3231
3232 def variadic(x, allowed_types=(str, bytes, dict)):
3233     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
3235
3236 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3237     for val in map(d.get, variadic(key_or_keys)):
3238         if val is not None and (val or not skip_false_values):
3239             return val
3240     return default
3241
3242
3243 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244     for f in funcs:
3245         try:
3246             val = f(*args, **kwargs)
3247         except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3248             pass
3249         else:
3250             if expected_type is None or isinstance(val, expected_type):
3251                 return val
3252
3253
3254 def try_get(src, getter, expected_type=None):
3255     return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3256
3257
3258 def filter_dict(dct, cndn=lambda _, v: v is not None):
3259     return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
3262 def merge_dicts(*dicts):
3263     merged = {}
3264     for a_dict in dicts:
3265         for k, v in a_dict.items():
3266             if (v is not None and k not in merged
3267                     or isinstance(v, str) and merged[k] == ''):
3268                 merged[k] = v
3269     return merged
3270
3271
3272 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3273     return string if isinstance(string, str) else str(string, encoding, errors)
3274
3275
3276 US_RATINGS = {
3277     'G': 0,
3278     'PG': 10,
3279     'PG-13': 13,
3280     'R': 16,
3281     'NC': 18,
3282 }
3283
3284
3285 TV_PARENTAL_GUIDELINES = {
3286     'TV-Y': 0,
3287     'TV-Y7': 7,
3288     'TV-G': 0,
3289     'TV-PG': 0,
3290     'TV-14': 14,
3291     'TV-MA': 17,
3292 }
3293
3294
3295 def parse_age_limit(s):
3296     # isinstance(False, int) is True. So type() must be used instead
3297     if type(s) is int:  # noqa: E721
3298         return s if 0 <= s <= 21 else None
3299     elif not isinstance(s, str):
3300         return None
3301     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3302     if m:
3303         return int(m.group('age'))
3304     s = s.upper()
3305     if s in US_RATINGS:
3306         return US_RATINGS[s]
3307     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3308     if m:
3309         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3310     return None
3311
3312
3313 def strip_jsonp(code):
3314     return re.sub(
3315         r'''(?sx)^
3316             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3317             (?:\s*&&\s*(?P=func_name))?
3318             \s*\(\s*(?P<callback_data>.*)\);?
3319             \s*?(?://[^\n]*)*$''',
3320         r'\g<callback_data>', code)
3321
3322
3323 def js_to_json(code, vars={}, *, strict=False):
3324     # vars is a dict of var, val pairs to substitute
3325     STRING_QUOTES = '\'"'
3326     STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3327     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3328     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3329     INTEGER_TABLE = (
3330         (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331         (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3332     )
3333
3334     def process_escape(match):
3335         JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336         escape = match.group(1) or match.group(2)
3337
3338         return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339                 else R'\u00' if escape == 'x'
3340                 else '' if escape == '\n'
3341                 else escape)
3342
3343     def fix_kv(m):
3344         v = m.group(0)
3345         if v in ('true', 'false', 'null'):
3346             return v
3347         elif v in ('undefined', 'void 0'):
3348             return 'null'
3349         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3350             return ''
3351
3352         if v[0] in STRING_QUOTES:
3353             escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354             return f'"{escaped}"'
3355
3356         for regex, base in INTEGER_TABLE:
3357             im = re.match(regex, v)
3358             if im:
3359                 i = int(im.group(1), base)
3360                 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362         if v in vars:
3363             try:
3364                 if not strict:
3365                     json.loads(vars[v])
3366             except json.decoder.JSONDecodeError:
3367                 return json.dumps(vars[v])
3368             else:
3369                 return vars[v]
3370
3371         if not strict:
3372             return f'"{v}"'
3373
3374         raise ValueError(f'Unknown value: {v}')
3375
3376     def create_map(mobj):
3377         return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3378
3379     code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3380     if not strict:
3381         code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3382         code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3383
3384     return re.sub(rf'''(?sx)
3385         {STRING_RE}|
3386         {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3387         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3388         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3389         [0-9]+(?={SKIP_RE}:)|
3390         !+
3391         ''', fix_kv, code)
3392
3393
3394 def qualities(quality_ids):
3395     """ Get a numeric quality value out of a list of possible values """
3396     def q(qid):
3397         try:
3398             return quality_ids.index(qid)
3399         except ValueError:
3400             return -1
3401     return q
3402
3403
3404 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3405
3406
3407 DEFAULT_OUTTMPL = {
3408     'default': '%(title)s [%(id)s].%(ext)s',
3409     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3410 }
3411 OUTTMPL_TYPES = {
3412     'chapter': None,
3413     'subtitle': None,
3414     'thumbnail': None,
3415     'description': 'description',
3416     'annotation': 'annotations.xml',
3417     'infojson': 'info.json',
3418     'link': None,
3419     'pl_video': None,
3420     'pl_thumbnail': None,
3421     'pl_description': 'description',
3422     'pl_infojson': 'info.json',
3423 }
3424
3425 # As of [1] format syntax is:
3426 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3427 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3428 STR_FORMAT_RE_TMPL = r'''(?x)
3429     (?<!%)(?P<prefix>(?:%%)*)
3430     %
3431     (?P<has_key>\((?P<key>{0})\))?
3432     (?P<format>
3433         (?P<conversion>[#0\-+ ]+)?
3434         (?P<min_width>\d+)?
3435         (?P<precision>\.\d+)?
3436         (?P<len_mod>[hlL])?  # unused in python
3437         {1}  # conversion type
3438     )
3439 '''
3440
3441
3442 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3443
3444
3445 def limit_length(s, length):
3446     """ Add ellipses to overly long strings """
3447     if s is None:
3448         return None
3449     ELLIPSES = '...'
3450     if len(s) > length:
3451         return s[:length - len(ELLIPSES)] + ELLIPSES
3452     return s
3453
3454
3455 def version_tuple(v):
3456     return tuple(int(e) for e in re.split(r'[-.]', v))
3457
3458
3459 def is_outdated_version(version, limit, assume_new=True):
3460     if not version:
3461         return not assume_new
3462     try:
3463         return version_tuple(version) < version_tuple(limit)
3464     except ValueError:
3465         return not assume_new
3466
3467
3468 def ytdl_is_updateable():
3469     """ Returns if yt-dlp can be updated with -U """
3470
3471     from .update import is_non_updateable
3472
3473     return not is_non_updateable()
3474
3475
3476 def args_to_str(args):
3477     # Get a short string representation for a subprocess command
3478     return ' '.join(compat_shlex_quote(a) for a in args)
3479
3480
3481 def error_to_compat_str(err):
3482     return str(err)
3483
3484
3485 def error_to_str(err):
3486     return f'{type(err).__name__}: {err}'
3487
3488
3489 def mimetype2ext(mt, default=NO_DEFAULT):
3490     if not isinstance(mt, str):
3491         if default is not NO_DEFAULT:
3492             return default
3493         return None
3494
3495     MAP = {
3496         # video
3497         '3gpp': '3gp',
3498         'mp2t': 'ts',
3499         'mp4': 'mp4',
3500         'mpeg': 'mpeg',
3501         'mpegurl': 'm3u8',
3502         'quicktime': 'mov',
3503         'webm': 'webm',
3504         'vp9': 'vp9',
3505         'x-flv': 'flv',
3506         'x-m4v': 'm4v',
3507         'x-matroska': 'mkv',
3508         'x-mng': 'mng',
3509         'x-mp4-fragmented': 'mp4',
3510         'x-ms-asf': 'asf',
3511         'x-ms-wmv': 'wmv',
3512         'x-msvideo': 'avi',
3513
3514         # application (streaming playlists)
3515         'dash+xml': 'mpd',
3516         'f4m+xml': 'f4m',
3517         'hds+xml': 'f4m',
3518         'vnd.apple.mpegurl': 'm3u8',
3519         'vnd.ms-sstr+xml': 'ism',
3520         'x-mpegurl': 'm3u8',
3521
3522         # audio
3523         'audio/mp4': 'm4a',
3524         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3525         # Using .mp3 as it's the most popular one
3526         'audio/mpeg': 'mp3',
3527         'audio/webm': 'weba',
3528         'audio/x-matroska': 'mka',
3529         'audio/x-mpegurl': 'm3u',
3530         'midi': 'mid',
3531         'ogg': 'ogg',
3532         'wav': 'wav',
3533         'wave': 'wav',
3534         'x-aac': 'aac',
3535         'x-flac': 'flac',
3536         'x-m4a': 'm4a',
3537         'x-realaudio': 'ra',
3538         'x-wav': 'wav',
3539
3540         # image
3541         'avif': 'avif',
3542         'bmp': 'bmp',
3543         'gif': 'gif',
3544         'jpeg': 'jpg',
3545         'png': 'png',
3546         'svg+xml': 'svg',
3547         'tiff': 'tif',
3548         'vnd.wap.wbmp': 'wbmp',
3549         'webp': 'webp',
3550         'x-icon': 'ico',
3551         'x-jng': 'jng',
3552         'x-ms-bmp': 'bmp',
3553
3554         # caption
3555         'filmstrip+json': 'fs',
3556         'smptett+xml': 'tt',
3557         'ttaf+xml': 'dfxp',
3558         'ttml+xml': 'ttml',
3559         'x-ms-sami': 'sami',
3560
3561         # misc
3562         'gzip': 'gz',
3563         'json': 'json',
3564         'xml': 'xml',
3565         'zip': 'zip',
3566     }
3567
3568     mimetype = mt.partition(';')[0].strip().lower()
3569     _, _, subtype = mimetype.rpartition('/')
3570
3571     ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3572     if ext:
3573         return ext
3574     elif default is not NO_DEFAULT:
3575         return default
3576     return subtype.replace('+', '.')
3577
3578
3579 def ext2mimetype(ext_or_url):
3580     if not ext_or_url:
3581         return None
3582     if '.' not in ext_or_url:
3583         ext_or_url = f'file.{ext_or_url}'
3584     return mimetypes.guess_type(ext_or_url)[0]
3585
3586
3587 def parse_codecs(codecs_str):
3588     # http://tools.ietf.org/html/rfc6381
3589     if not codecs_str:
3590         return {}
3591     split_codecs = list(filter(None, map(
3592         str.strip, codecs_str.strip().strip(',').split(','))))
3593     vcodec, acodec, scodec, hdr = None, None, None, None
3594     for full_codec in split_codecs:
3595         parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3596         if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3597                         'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3598             if vcodec:
3599                 continue
3600             vcodec = full_codec
3601             if parts[0] in ('dvh1', 'dvhe'):
3602                 hdr = 'DV'
3603             elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3604                 hdr = 'HDR10'
3605             elif parts[:2] == ['vp9', '2']:
3606                 hdr = 'HDR10'
3607         elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3608                           'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3609             acodec = acodec or full_codec
3610         elif parts[0] in ('stpp', 'wvtt'):
3611             scodec = scodec or full_codec
3612         else:
3613             write_string(f'WARNING: Unknown codec {full_codec}\n')
3614     if vcodec or acodec or scodec:
3615         return {
3616             'vcodec': vcodec or 'none',
3617             'acodec': acodec or 'none',
3618             'dynamic_range': hdr,
3619             **({'scodec': scodec} if scodec is not None else {}),
3620         }
3621     elif len(split_codecs) == 2:
3622         return {
3623             'vcodec': split_codecs[0],
3624             'acodec': split_codecs[1],
3625         }
3626     return {}
3627
3628
3629 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3630     assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3631
3632     allow_mkv = not preferences or 'mkv' in preferences
3633
3634     if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3635         return 'mkv'  # TODO: any other format allows this?
3636
3637     # TODO: All codecs supported by parse_codecs isn't handled here
3638     COMPATIBLE_CODECS = {
3639         'mp4': {
3640             'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
3641             'h264', 'aacl', 'ec-3',  # Set in ISM
3642         },
3643         'webm': {
3644             'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3645             'vp9x', 'vp8x',  # in the webm spec
3646         },
3647     }
3648
3649     sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3650     vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3651
3652     for ext in preferences or COMPATIBLE_CODECS.keys():
3653         codec_set = COMPATIBLE_CODECS.get(ext, set())
3654         if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3655             return ext
3656
3657     COMPATIBLE_EXTS = (
3658         {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3659         {'webm'},
3660     )
3661     for ext in preferences or vexts:
3662         current_exts = {ext, *vexts, *aexts}
3663         if ext == 'mkv' or current_exts == {ext} or any(
3664                 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3665             return ext
3666     return 'mkv' if allow_mkv else preferences[-1]
3667
3668
3669 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3670     getheader = url_handle.headers.get
3671
3672     cd = getheader('Content-Disposition')
3673     if cd:
3674         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3675         if m:
3676             e = determine_ext(m.group('filename'), default_ext=None)
3677             if e:
3678                 return e
3679
3680     meta_ext = getheader('x-amz-meta-name')
3681     if meta_ext:
3682         e = meta_ext.rpartition('.')[2]
3683         if e:
3684             return e
3685
3686     return mimetype2ext(getheader('Content-Type'), default=default)
3687
3688
3689 def encode_data_uri(data, mime_type):
3690     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3691
3692
3693 def age_restricted(content_limit, age_limit):
3694     """ Returns True iff the content should be blocked """
3695
3696     if age_limit is None:  # No limit set
3697         return False
3698     if content_limit is None:
3699         return False  # Content available for everyone
3700     return age_limit < content_limit
3701
3702
3703 # List of known byte-order-marks (BOM)
3704 BOMS = [
3705     (b'\xef\xbb\xbf', 'utf-8'),
3706     (b'\x00\x00\xfe\xff', 'utf-32-be'),
3707     (b'\xff\xfe\x00\x00', 'utf-32-le'),
3708     (b'\xff\xfe', 'utf-16-le'),
3709     (b'\xfe\xff', 'utf-16-be'),
3710 ]
3711
3712
3713 def is_html(first_bytes):
3714     """ Detect whether a file contains HTML by examining its first bytes. """
3715
3716     encoding = 'utf-8'
3717     for bom, enc in BOMS:
3718         while first_bytes.startswith(bom):
3719             encoding, first_bytes = enc, first_bytes[len(bom):]
3720
3721     return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3722
3723
3724 def determine_protocol(info_dict):
3725     protocol = info_dict.get('protocol')
3726     if protocol is not None:
3727         return protocol
3728
3729     url = sanitize_url(info_dict['url'])
3730     if url.startswith('rtmp'):
3731         return 'rtmp'
3732     elif url.startswith('mms'):
3733         return 'mms'
3734     elif url.startswith('rtsp'):
3735         return 'rtsp'
3736
3737     ext = determine_ext(url)
3738     if ext == 'm3u8':
3739         return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3740     elif ext == 'f4m':
3741         return 'f4m'
3742
3743     return urllib.parse.urlparse(url).scheme
3744
3745
3746 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3747     """ Render a list of rows, each as a list of values.
3748     Text after a \t will be right aligned """
3749     def width(string):
3750         return len(remove_terminal_sequences(string).replace('\t', ''))
3751
3752     def get_max_lens(table):
3753         return [max(width(str(v)) for v in col) for col in zip(*table)]
3754
3755     def filter_using_list(row, filterArray):
3756         return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3757
3758     max_lens = get_max_lens(data) if hide_empty else []
3759     header_row = filter_using_list(header_row, max_lens)
3760     data = [filter_using_list(row, max_lens) for row in data]
3761
3762     table = [header_row] + data
3763     max_lens = get_max_lens(table)
3764     extra_gap += 1
3765     if delim:
3766         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3767         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3768     for row in table:
3769         for pos, text in enumerate(map(str, row)):
3770             if '\t' in text:
3771                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3772             else:
3773                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3774     ret = '\n'.join(''.join(row).rstrip() for row in table)
3775     return ret
3776
3777
3778 def _match_one(filter_part, dct, incomplete):
3779     # TODO: Generalize code with YoutubeDL._build_format_filter
3780     STRING_OPERATORS = {
3781         '*=': operator.contains,
3782         '^=': lambda attr, value: attr.startswith(value),
3783         '$=': lambda attr, value: attr.endswith(value),
3784         '~=': lambda attr, value: re.search(value, attr),
3785     }
3786     COMPARISON_OPERATORS = {
3787         **STRING_OPERATORS,
3788         '<=': operator.le,  # "<=" must be defined above "<"
3789         '<': operator.lt,
3790         '>=': operator.ge,
3791         '>': operator.gt,
3792         '=': operator.eq,
3793     }
3794
3795     if isinstance(incomplete, bool):
3796         is_incomplete = lambda _: incomplete
3797     else:
3798         is_incomplete = lambda k: k in incomplete
3799
3800     operator_rex = re.compile(r'''(?x)
3801         (?P<key>[a-z_]+)
3802         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3803         (?:
3804             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3805             (?P<strval>.+?)
3806         )
3807         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3808     m = operator_rex.fullmatch(filter_part.strip())
3809     if m:
3810         m = m.groupdict()
3811         unnegated_op = COMPARISON_OPERATORS[m['op']]
3812         if m['negation']:
3813             op = lambda attr, value: not unnegated_op(attr, value)
3814         else:
3815             op = unnegated_op
3816         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3817         if m['quote']:
3818             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3819         actual_value = dct.get(m['key'])
3820         numeric_comparison = None
3821         if isinstance(actual_value, (int, float)):
3822             # If the original field is a string and matching comparisonvalue is
3823             # a number we should respect the origin of the original field
3824             # and process comparison value as a string (see
3825             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3826             try:
3827                 numeric_comparison = int(comparison_value)
3828             except ValueError:
3829                 numeric_comparison = parse_filesize(comparison_value)
3830                 if numeric_comparison is None:
3831                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3832                 if numeric_comparison is None:
3833                     numeric_comparison = parse_duration(comparison_value)
3834         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3835             raise ValueError('Operator %s only supports string values!' % m['op'])
3836         if actual_value is None:
3837             return is_incomplete(m['key']) or m['none_inclusive']
3838         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3839
3840     UNARY_OPERATORS = {
3841         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3842         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3843     }
3844     operator_rex = re.compile(r'''(?x)
3845         (?P<op>%s)\s*(?P<key>[a-z_]+)
3846         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3847     m = operator_rex.fullmatch(filter_part.strip())
3848     if m:
3849         op = UNARY_OPERATORS[m.group('op')]
3850         actual_value = dct.get(m.group('key'))
3851         if is_incomplete(m.group('key')) and actual_value is None:
3852             return True
3853         return op(actual_value)
3854
3855     raise ValueError('Invalid filter part %r' % filter_part)
3856
3857
3858 def match_str(filter_str, dct, incomplete=False):
3859     """ Filter a dictionary with a simple string syntax.
3860     @returns           Whether the filter passes
3861     @param incomplete  Set of keys that is expected to be missing from dct.
3862                        Can be True/False to indicate all/none of the keys may be missing.
3863                        All conditions on incomplete keys pass if the key is missing
3864     """
3865     return all(
3866         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3867         for filter_part in re.split(r'(?<!\\)&', filter_str))
3868
3869
3870 def match_filter_func(filters):
3871     if not filters:
3872         return None
3873     filters = set(variadic(filters))
3874
3875     interactive = '-' in filters
3876     if interactive:
3877         filters.remove('-')
3878
3879     def _match_func(info_dict, incomplete=False):
3880         if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3881             return NO_DEFAULT if interactive and not incomplete else None
3882         else:
3883             video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3884             filter_str = ') | ('.join(map(str.strip, filters))
3885             return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3886     return _match_func
3887
3888
3889 class download_range_func:
3890     def __init__(self, chapters, ranges):
3891         self.chapters, self.ranges = chapters, ranges
3892
3893     def __call__(self, info_dict, ydl):
3894         if not self.ranges and not self.chapters:
3895             yield {}
3896
3897         warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3898                    else 'Cannot match chapters since chapter information is unavailable')
3899         for regex in self.chapters or []:
3900             for i, chapter in enumerate(info_dict.get('chapters') or []):
3901                 if re.search(regex, chapter['title']):
3902                     warning = None
3903                     yield {**chapter, 'index': i}
3904         if self.chapters and warning:
3905             ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3906
3907         yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3908
3909     def __eq__(self, other):
3910         return (isinstance(other, download_range_func)
3911                 and self.chapters == other.chapters and self.ranges == other.ranges)
3912
3913     def __repr__(self):
3914         return f'{type(self).__name__}({self.chapters}, {self.ranges})'
3915
3916
3917 def parse_dfxp_time_expr(time_expr):
3918     if not time_expr:
3919         return
3920
3921     mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3922     if mobj:
3923         return float(mobj.group('time_offset'))
3924
3925     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3926     if mobj:
3927         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3928
3929
3930 def srt_subtitles_timecode(seconds):
3931     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3932
3933
3934 def ass_subtitles_timecode(seconds):
3935     time = timetuple_from_msec(seconds * 1000)
3936     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3937
3938
3939 def dfxp2srt(dfxp_data):
3940     '''
3941     @param dfxp_data A bytes-like object containing DFXP data
3942     @returns A unicode object containing converted SRT data
3943     '''
3944     LEGACY_NAMESPACES = (
3945         (b'http://www.w3.org/ns/ttml', [
3946             b'http://www.w3.org/2004/11/ttaf1',
3947             b'http://www.w3.org/2006/04/ttaf1',
3948             b'http://www.w3.org/2006/10/ttaf1',
3949         ]),
3950         (b'http://www.w3.org/ns/ttml#styling', [
3951             b'http://www.w3.org/ns/ttml#style',
3952         ]),
3953     )
3954
3955     SUPPORTED_STYLING = [
3956         'color',
3957         'fontFamily',
3958         'fontSize',
3959         'fontStyle',
3960         'fontWeight',
3961         'textDecoration'
3962     ]
3963
3964     _x = functools.partial(xpath_with_ns, ns_map={
3965         'xml': 'http://www.w3.org/XML/1998/namespace',
3966         'ttml': 'http://www.w3.org/ns/ttml',
3967         'tts': 'http://www.w3.org/ns/ttml#styling',
3968     })
3969
3970     styles = {}
3971     default_style = {}
3972
3973     class TTMLPElementParser:
3974         _out = ''
3975         _unclosed_elements = []
3976         _applied_styles = []
3977
3978         def start(self, tag, attrib):
3979             if tag in (_x('ttml:br'), 'br'):
3980                 self._out += '\n'
3981             else:
3982                 unclosed_elements = []
3983                 style = {}
3984                 element_style_id = attrib.get('style')
3985                 if default_style:
3986                     style.update(default_style)
3987                 if element_style_id:
3988                     style.update(styles.get(element_style_id, {}))
3989                 for prop in SUPPORTED_STYLING:
3990                     prop_val = attrib.get(_x('tts:' + prop))
3991                     if prop_val:
3992                         style[prop] = prop_val
3993                 if style:
3994                     font = ''
3995                     for k, v in sorted(style.items()):
3996                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3997                             continue
3998                         if k == 'color':
3999                             font += ' color="%s"' % v
4000                         elif k == 'fontSize':
4001                             font += ' size="%s"' % v
4002                         elif k == 'fontFamily':
4003                             font += ' face="%s"' % v
4004                         elif k == 'fontWeight' and v == 'bold':
4005                             self._out += '<b>'
4006                             unclosed_elements.append('b')
4007                         elif k == 'fontStyle' and v == 'italic':
4008                             self._out += '<i>'
4009                             unclosed_elements.append('i')
4010                         elif k == 'textDecoration' and v == 'underline':
4011                             self._out += '<u>'
4012                             unclosed_elements.append('u')
4013                     if font:
4014                         self._out += '<font' + font + '>'
4015                         unclosed_elements.append('font')
4016                     applied_style = {}
4017                     if self._applied_styles:
4018                         applied_style.update(self._applied_styles[-1])
4019                     applied_style.update(style)
4020                     self._applied_styles.append(applied_style)
4021                 self._unclosed_elements.append(unclosed_elements)
4022
4023         def end(self, tag):
4024             if tag not in (_x('ttml:br'), 'br'):
4025                 unclosed_elements = self._unclosed_elements.pop()
4026                 for element in reversed(unclosed_elements):
4027                     self._out += '</%s>' % element
4028                 if unclosed_elements and self._applied_styles:
4029                     self._applied_styles.pop()
4030
4031         def data(self, data):
4032             self._out += data
4033
4034         def close(self):
4035             return self._out.strip()
4036
4037     def parse_node(node):
4038         target = TTMLPElementParser()
4039         parser = xml.etree.ElementTree.XMLParser(target=target)
4040         parser.feed(xml.etree.ElementTree.tostring(node))
4041         return parser.close()
4042
4043     for k, v in LEGACY_NAMESPACES:
4044         for ns in v:
4045             dfxp_data = dfxp_data.replace(ns, k)
4046
4047     dfxp = compat_etree_fromstring(dfxp_data)
4048     out = []
4049     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4050
4051     if not paras:
4052         raise ValueError('Invalid dfxp/TTML subtitle')
4053
4054     repeat = False
4055     while True:
4056         for style in dfxp.findall(_x('.//ttml:style')):
4057             style_id = style.get('id') or style.get(_x('xml:id'))
4058             if not style_id:
4059                 continue
4060             parent_style_id = style.get('style')
4061             if parent_style_id:
4062                 if parent_style_id not in styles:
4063                     repeat = True
4064                     continue
4065                 styles[style_id] = styles[parent_style_id].copy()
4066             for prop in SUPPORTED_STYLING:
4067                 prop_val = style.get(_x('tts:' + prop))
4068                 if prop_val:
4069                     styles.setdefault(style_id, {})[prop] = prop_val
4070         if repeat:
4071             repeat = False
4072         else:
4073             break
4074
4075     for p in ('body', 'div'):
4076         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4077         if ele is None:
4078             continue
4079         style = styles.get(ele.get('style'))
4080         if not style:
4081             continue
4082         default_style.update(style)
4083
4084     for para, index in zip(paras, itertools.count(1)):
4085         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4086         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4087         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4088         if begin_time is None:
4089             continue
4090         if not end_time:
4091             if not dur:
4092                 continue
4093             end_time = begin_time + dur
4094         out.append('%d\n%s --> %s\n%s\n\n' % (
4095             index,
4096             srt_subtitles_timecode(begin_time),
4097             srt_subtitles_timecode(end_time),
4098             parse_node(para)))
4099
4100     return ''.join(out)
4101
4102
4103 def cli_option(params, command_option, param, separator=None):
4104     param = params.get(param)
4105     return ([] if param is None
4106             else [command_option, str(param)] if separator is None
4107             else [f'{command_option}{separator}{param}'])
4108
4109
4110 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4111     param = params.get(param)
4112     assert param in (True, False, None)
4113     return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4114
4115
4116 def cli_valueless_option(params, command_option, param, expected_value=True):
4117     return [command_option] if params.get(param) == expected_value else []
4118
4119
4120 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4121     if isinstance(argdict, (list, tuple)):  # for backward compatibility
4122         if use_compat:
4123             return argdict
4124         else:
4125             argdict = None
4126     if argdict is None:
4127         return default
4128     assert isinstance(argdict, dict)
4129
4130     assert isinstance(keys, (list, tuple))
4131     for key_list in keys:
4132         arg_list = list(filter(
4133             lambda x: x is not None,
4134             [argdict.get(key.lower()) for key in variadic(key_list)]))
4135         if arg_list:
4136             return [arg for args in arg_list for arg in args]
4137     return default
4138
4139
4140 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4141     main_key, exe = main_key.lower(), exe.lower()
4142     root_key = exe if main_key == exe else f'{main_key}+{exe}'
4143     keys = [f'{root_key}{k}' for k in (keys or [''])]
4144     if root_key in keys:
4145         if main_key != exe:
4146             keys.append((main_key, exe))
4147         keys.append('default')
4148     else:
4149         use_compat = False
4150     return cli_configuration_args(argdict, keys, default, use_compat)
4151
4152
4153 class ISO639Utils:
4154     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4155     _lang_map = {
4156         'aa': 'aar',
4157         'ab': 'abk',
4158         'ae': 'ave',
4159         'af': 'afr',
4160         'ak': 'aka',
4161         'am': 'amh',
4162         'an': 'arg',
4163         'ar': 'ara',
4164         'as': 'asm',
4165         'av': 'ava',
4166         'ay': 'aym',
4167         'az': 'aze',
4168         'ba': 'bak',
4169         'be': 'bel',
4170         'bg': 'bul',
4171         'bh': 'bih',
4172         'bi': 'bis',
4173         'bm': 'bam',
4174         'bn': 'ben',
4175         'bo': 'bod',
4176         'br': 'bre',
4177         'bs': 'bos',
4178         'ca': 'cat',
4179         'ce': 'che',
4180         'ch': 'cha',
4181         'co': 'cos',
4182         'cr': 'cre',
4183         'cs': 'ces',
4184         'cu': 'chu',
4185         'cv': 'chv',
4186         'cy': 'cym',
4187         'da': 'dan',
4188         'de': 'deu',
4189         'dv': 'div',
4190         'dz': 'dzo',
4191         'ee': 'ewe',
4192         'el': 'ell',
4193         'en': 'eng',
4194         'eo': 'epo',
4195         'es': 'spa',
4196         'et': 'est',
4197         'eu': 'eus',
4198         'fa': 'fas',
4199         'ff': 'ful',
4200         'fi': 'fin',
4201         'fj': 'fij',
4202         'fo': 'fao',
4203         'fr': 'fra',
4204         'fy': 'fry',
4205         'ga': 'gle',
4206         'gd': 'gla',
4207         'gl': 'glg',
4208         'gn': 'grn',
4209         'gu': 'guj',
4210         'gv': 'glv',
4211         'ha': 'hau',
4212         'he': 'heb',
4213         'iw': 'heb',  # Replaced by he in 1989 revision
4214         'hi': 'hin',
4215         'ho': 'hmo',
4216         'hr': 'hrv',
4217         'ht': 'hat',
4218         'hu': 'hun',
4219         'hy': 'hye',
4220         'hz': 'her',
4221         'ia': 'ina',
4222         'id': 'ind',
4223         'in': 'ind',  # Replaced by id in 1989 revision
4224         'ie': 'ile',
4225         'ig': 'ibo',
4226         'ii': 'iii',
4227         'ik': 'ipk',
4228         'io': 'ido',
4229         'is': 'isl',
4230         'it': 'ita',
4231         'iu': 'iku',
4232         'ja': 'jpn',
4233         'jv': 'jav',
4234         'ka': 'kat',
4235         'kg': 'kon',
4236         'ki': 'kik',
4237         'kj': 'kua',
4238         'kk': 'kaz',
4239         'kl': 'kal',
4240         'km': 'khm',
4241         'kn': 'kan',
4242         'ko': 'kor',
4243         'kr': 'kau',
4244         'ks': 'kas',
4245         'ku': 'kur',
4246         'kv': 'kom',
4247         'kw': 'cor',
4248         'ky': 'kir',
4249         'la': 'lat',
4250         'lb': 'ltz',
4251         'lg': 'lug',
4252         'li': 'lim',
4253         'ln': 'lin',
4254         'lo': 'lao',
4255         'lt': 'lit',
4256         'lu': 'lub',
4257         'lv': 'lav',
4258         'mg': 'mlg',
4259         'mh': 'mah',
4260         'mi': 'mri',
4261         'mk': 'mkd',
4262         'ml': 'mal',
4263         'mn': 'mon',
4264         'mr': 'mar',
4265         'ms': 'msa',
4266         'mt': 'mlt',
4267         'my': 'mya',
4268         'na': 'nau',
4269         'nb': 'nob',
4270         'nd': 'nde',
4271         'ne': 'nep',
4272         'ng': 'ndo',
4273         'nl': 'nld',
4274         'nn': 'nno',
4275         'no': 'nor',
4276         'nr': 'nbl',
4277         'nv': 'nav',
4278         'ny': 'nya',
4279         'oc': 'oci',
4280         'oj': 'oji',
4281         'om': 'orm',
4282         'or': 'ori',
4283         'os': 'oss',
4284         'pa': 'pan',
4285         'pi': 'pli',
4286         'pl': 'pol',
4287         'ps': 'pus',
4288         'pt': 'por',
4289         'qu': 'que',
4290         'rm': 'roh',
4291         'rn': 'run',
4292         'ro': 'ron',
4293         'ru': 'rus',
4294         'rw': 'kin',
4295         'sa': 'san',
4296         'sc': 'srd',
4297         'sd': 'snd',
4298         'se': 'sme',
4299         'sg': 'sag',
4300         'si': 'sin',
4301         'sk': 'slk',
4302         'sl': 'slv',
4303         'sm': 'smo',
4304         'sn': 'sna',
4305         'so': 'som',
4306         'sq': 'sqi',
4307         'sr': 'srp',
4308         'ss': 'ssw',
4309         'st': 'sot',
4310         'su': 'sun',
4311         'sv': 'swe',
4312         'sw': 'swa',
4313         'ta': 'tam',
4314         'te': 'tel',
4315         'tg': 'tgk',
4316         'th': 'tha',
4317         'ti': 'tir',
4318         'tk': 'tuk',
4319         'tl': 'tgl',
4320         'tn': 'tsn',
4321         'to': 'ton',
4322         'tr': 'tur',
4323         'ts': 'tso',
4324         'tt': 'tat',
4325         'tw': 'twi',
4326         'ty': 'tah',
4327         'ug': 'uig',
4328         'uk': 'ukr',
4329         'ur': 'urd',
4330         'uz': 'uzb',
4331         've': 'ven',
4332         'vi': 'vie',
4333         'vo': 'vol',
4334         'wa': 'wln',
4335         'wo': 'wol',
4336         'xh': 'xho',
4337         'yi': 'yid',
4338         'ji': 'yid',  # Replaced by yi in 1989 revision
4339         'yo': 'yor',
4340         'za': 'zha',
4341         'zh': 'zho',
4342         'zu': 'zul',
4343     }
4344
4345     @classmethod
4346     def short2long(cls, code):
4347         """Convert language code from ISO 639-1 to ISO 639-2/T"""
4348         return cls._lang_map.get(code[:2])
4349
4350     @classmethod
4351     def long2short(cls, code):
4352         """Convert language code from ISO 639-2/T to ISO 639-1"""
4353         for short_name, long_name in cls._lang_map.items():
4354             if long_name == code:
4355                 return short_name
4356
4357
4358 class ISO3166Utils:
4359     # From http://data.okfn.org/data/core/country-list
4360     _country_map = {
4361         'AF': 'Afghanistan',
4362         'AX': 'Åland Islands',
4363         'AL': 'Albania',
4364         'DZ': 'Algeria',
4365         'AS': 'American Samoa',
4366         'AD': 'Andorra',
4367         'AO': 'Angola',
4368         'AI': 'Anguilla',
4369         'AQ': 'Antarctica',
4370         'AG': 'Antigua and Barbuda',
4371         'AR': 'Argentina',
4372         'AM': 'Armenia',
4373         'AW': 'Aruba',
4374         'AU': 'Australia',
4375         'AT': 'Austria',
4376         'AZ': 'Azerbaijan',
4377         'BS': 'Bahamas',
4378         'BH': 'Bahrain',
4379         'BD': 'Bangladesh',
4380         'BB': 'Barbados',
4381         'BY': 'Belarus',
4382         'BE': 'Belgium',
4383         'BZ': 'Belize',
4384         'BJ': 'Benin',
4385         'BM': 'Bermuda',
4386         'BT': 'Bhutan',
4387         'BO': 'Bolivia, Plurinational State of',
4388         'BQ': 'Bonaire, Sint Eustatius and Saba',
4389         'BA': 'Bosnia and Herzegovina',
4390         'BW': 'Botswana',
4391         'BV': 'Bouvet Island',
4392         'BR': 'Brazil',
4393         'IO': 'British Indian Ocean Territory',
4394         'BN': 'Brunei Darussalam',
4395         'BG': 'Bulgaria',
4396         'BF': 'Burkina Faso',
4397         'BI': 'Burundi',
4398         'KH': 'Cambodia',
4399         'CM': 'Cameroon',
4400         'CA': 'Canada',
4401         'CV': 'Cape Verde',
4402         'KY': 'Cayman Islands',
4403         'CF': 'Central African Republic',
4404         'TD': 'Chad',
4405         'CL': 'Chile',
4406         'CN': 'China',
4407         'CX': 'Christmas Island',
4408         'CC': 'Cocos (Keeling) Islands',
4409         'CO': 'Colombia',
4410         'KM': 'Comoros',
4411         'CG': 'Congo',
4412         'CD': 'Congo, the Democratic Republic of the',
4413         'CK': 'Cook Islands',
4414         'CR': 'Costa Rica',
4415         'CI': 'Côte d\'Ivoire',
4416         'HR': 'Croatia',
4417         'CU': 'Cuba',
4418         'CW': 'Curaçao',
4419         'CY': 'Cyprus',
4420         'CZ': 'Czech Republic',
4421         'DK': 'Denmark',
4422         'DJ': 'Djibouti',
4423         'DM': 'Dominica',
4424         'DO': 'Dominican Republic',
4425         'EC': 'Ecuador',
4426         'EG': 'Egypt',
4427         'SV': 'El Salvador',
4428         'GQ': 'Equatorial Guinea',
4429         'ER': 'Eritrea',
4430         'EE': 'Estonia',
4431         'ET': 'Ethiopia',
4432         'FK': 'Falkland Islands (Malvinas)',
4433         'FO': 'Faroe Islands',
4434         'FJ': 'Fiji',
4435         'FI': 'Finland',
4436         'FR': 'France',
4437         'GF': 'French Guiana',
4438         'PF': 'French Polynesia',
4439         'TF': 'French Southern Territories',
4440         'GA': 'Gabon',
4441         'GM': 'Gambia',
4442         'GE': 'Georgia',
4443         'DE': 'Germany',
4444         'GH': 'Ghana',
4445         'GI': 'Gibraltar',
4446         'GR': 'Greece',
4447         'GL': 'Greenland',
4448         'GD': 'Grenada',
4449         'GP': 'Guadeloupe',
4450         'GU': 'Guam',
4451         'GT': 'Guatemala',
4452         'GG': 'Guernsey',
4453         'GN': 'Guinea',
4454         'GW': 'Guinea-Bissau',
4455         'GY': 'Guyana',
4456         'HT': 'Haiti',
4457         'HM': 'Heard Island and McDonald Islands',
4458         'VA': 'Holy See (Vatican City State)',
4459         'HN': 'Honduras',
4460         'HK': 'Hong Kong',
4461         'HU': 'Hungary',
4462         'IS': 'Iceland',
4463         'IN': 'India',
4464         'ID': 'Indonesia',
4465         'IR': 'Iran, Islamic Republic of',
4466         'IQ': 'Iraq',
4467         'IE': 'Ireland',
4468         'IM': 'Isle of Man',
4469         'IL': 'Israel',
4470         'IT': 'Italy',
4471         'JM': 'Jamaica',
4472         'JP': 'Japan',
4473         'JE': 'Jersey',
4474         'JO': 'Jordan',
4475         'KZ': 'Kazakhstan',
4476         'KE': 'Kenya',
4477         'KI': 'Kiribati',
4478         'KP': 'Korea, Democratic People\'s Republic of',
4479         'KR': 'Korea, Republic of',
4480         'KW': 'Kuwait',
4481         'KG': 'Kyrgyzstan',
4482         'LA': 'Lao People\'s Democratic Republic',
4483         'LV': 'Latvia',
4484         'LB': 'Lebanon',
4485         'LS': 'Lesotho',
4486         'LR': 'Liberia',
4487         'LY': 'Libya',
4488         'LI': 'Liechtenstein',
4489         'LT': 'Lithuania',
4490         'LU': 'Luxembourg',
4491         'MO': 'Macao',
4492         'MK': 'Macedonia, the Former Yugoslav Republic of',
4493         'MG': 'Madagascar',
4494         'MW': 'Malawi',
4495         'MY': 'Malaysia',
4496         'MV': 'Maldives',
4497         'ML': 'Mali',
4498         'MT': 'Malta',
4499         'MH': 'Marshall Islands',
4500         'MQ': 'Martinique',
4501         'MR': 'Mauritania',
4502         'MU': 'Mauritius',
4503         'YT': 'Mayotte',
4504         'MX': 'Mexico',
4505         'FM': 'Micronesia, Federated States of',
4506         'MD': 'Moldova, Republic of',
4507         'MC': 'Monaco',
4508         'MN': 'Mongolia',
4509         'ME': 'Montenegro',
4510         'MS': 'Montserrat',
4511         'MA': 'Morocco',
4512         'MZ': 'Mozambique',
4513         'MM': 'Myanmar',
4514         'NA': 'Namibia',
4515         'NR': 'Nauru',
4516         'NP': 'Nepal',
4517         'NL': 'Netherlands',
4518         'NC': 'New Caledonia',
4519         'NZ': 'New Zealand',
4520         'NI': 'Nicaragua',
4521         'NE': 'Niger',
4522         'NG': 'Nigeria',
4523         'NU': 'Niue',
4524         'NF': 'Norfolk Island',
4525         'MP': 'Northern Mariana Islands',
4526         'NO': 'Norway',
4527         'OM': 'Oman',
4528         'PK': 'Pakistan',
4529         'PW': 'Palau',
4530         'PS': 'Palestine, State of',
4531         'PA': 'Panama',
4532         'PG': 'Papua New Guinea',
4533         'PY': 'Paraguay',
4534         'PE': 'Peru',
4535         'PH': 'Philippines',
4536         'PN': 'Pitcairn',
4537         'PL': 'Poland',
4538         'PT': 'Portugal',
4539         'PR': 'Puerto Rico',
4540         'QA': 'Qatar',
4541         'RE': 'Réunion',
4542         'RO': 'Romania',
4543         'RU': 'Russian Federation',
4544         'RW': 'Rwanda',
4545         'BL': 'Saint Barthélemy',
4546         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4547         'KN': 'Saint Kitts and Nevis',
4548         'LC': 'Saint Lucia',
4549         'MF': 'Saint Martin (French part)',
4550         'PM': 'Saint Pierre and Miquelon',
4551         'VC': 'Saint Vincent and the Grenadines',
4552         'WS': 'Samoa',
4553         'SM': 'San Marino',
4554         'ST': 'Sao Tome and Principe',
4555         'SA': 'Saudi Arabia',
4556         'SN': 'Senegal',
4557         'RS': 'Serbia',
4558         'SC': 'Seychelles',
4559         'SL': 'Sierra Leone',
4560         'SG': 'Singapore',
4561         'SX': 'Sint Maarten (Dutch part)',
4562         'SK': 'Slovakia',
4563         'SI': 'Slovenia',
4564         'SB': 'Solomon Islands',
4565         'SO': 'Somalia',
4566         'ZA': 'South Africa',
4567         'GS': 'South Georgia and the South Sandwich Islands',
4568         'SS': 'South Sudan',
4569         'ES': 'Spain',
4570         'LK': 'Sri Lanka',
4571         'SD': 'Sudan',
4572         'SR': 'Suriname',
4573         'SJ': 'Svalbard and Jan Mayen',
4574         'SZ': 'Swaziland',
4575         'SE': 'Sweden',
4576         'CH': 'Switzerland',
4577         'SY': 'Syrian Arab Republic',
4578         'TW': 'Taiwan, Province of China',
4579         'TJ': 'Tajikistan',
4580         'TZ': 'Tanzania, United Republic of',
4581         'TH': 'Thailand',
4582         'TL': 'Timor-Leste',
4583         'TG': 'Togo',
4584         'TK': 'Tokelau',
4585         'TO': 'Tonga',
4586         'TT': 'Trinidad and Tobago',
4587         'TN': 'Tunisia',
4588         'TR': 'Turkey',
4589         'TM': 'Turkmenistan',
4590         'TC': 'Turks and Caicos Islands',
4591         'TV': 'Tuvalu',
4592         'UG': 'Uganda',
4593         'UA': 'Ukraine',
4594         'AE': 'United Arab Emirates',
4595         'GB': 'United Kingdom',
4596         'US': 'United States',
4597         'UM': 'United States Minor Outlying Islands',
4598         'UY': 'Uruguay',
4599         'UZ': 'Uzbekistan',
4600         'VU': 'Vanuatu',
4601         'VE': 'Venezuela, Bolivarian Republic of',
4602         'VN': 'Viet Nam',
4603         'VG': 'Virgin Islands, British',
4604         'VI': 'Virgin Islands, U.S.',
4605         'WF': 'Wallis and Futuna',
4606         'EH': 'Western Sahara',
4607         'YE': 'Yemen',
4608         'ZM': 'Zambia',
4609         'ZW': 'Zimbabwe',
4610         # Not ISO 3166 codes, but used for IP blocks
4611         'AP': 'Asia/Pacific Region',
4612         'EU': 'Europe',
4613     }
4614
4615     @classmethod
4616     def short2full(cls, code):
4617         """Convert an ISO 3166-2 country code to the corresponding full name"""
4618         return cls._country_map.get(code.upper())
4619
4620
4621 class GeoUtils:
4622     # Major IPv4 address blocks per country
4623     _country_ip_map = {
4624         'AD': '46.172.224.0/19',
4625         'AE': '94.200.0.0/13',
4626         'AF': '149.54.0.0/17',
4627         'AG': '209.59.64.0/18',
4628         'AI': '204.14.248.0/21',
4629         'AL': '46.99.0.0/16',
4630         'AM': '46.70.0.0/15',
4631         'AO': '105.168.0.0/13',
4632         'AP': '182.50.184.0/21',
4633         'AQ': '23.154.160.0/24',
4634         'AR': '181.0.0.0/12',
4635         'AS': '202.70.112.0/20',
4636         'AT': '77.116.0.0/14',
4637         'AU': '1.128.0.0/11',
4638         'AW': '181.41.0.0/18',
4639         'AX': '185.217.4.0/22',
4640         'AZ': '5.197.0.0/16',
4641         'BA': '31.176.128.0/17',
4642         'BB': '65.48.128.0/17',
4643         'BD': '114.130.0.0/16',
4644         'BE': '57.0.0.0/8',
4645         'BF': '102.178.0.0/15',
4646         'BG': '95.42.0.0/15',
4647         'BH': '37.131.0.0/17',
4648         'BI': '154.117.192.0/18',
4649         'BJ': '137.255.0.0/16',
4650         'BL': '185.212.72.0/23',
4651         'BM': '196.12.64.0/18',
4652         'BN': '156.31.0.0/16',
4653         'BO': '161.56.0.0/16',
4654         'BQ': '161.0.80.0/20',
4655         'BR': '191.128.0.0/12',
4656         'BS': '24.51.64.0/18',
4657         'BT': '119.2.96.0/19',
4658         'BW': '168.167.0.0/16',
4659         'BY': '178.120.0.0/13',
4660         'BZ': '179.42.192.0/18',
4661         'CA': '99.224.0.0/11',
4662         'CD': '41.243.0.0/16',
4663         'CF': '197.242.176.0/21',
4664         'CG': '160.113.0.0/16',
4665         'CH': '85.0.0.0/13',
4666         'CI': '102.136.0.0/14',
4667         'CK': '202.65.32.0/19',
4668         'CL': '152.172.0.0/14',
4669         'CM': '102.244.0.0/14',
4670         'CN': '36.128.0.0/10',
4671         'CO': '181.240.0.0/12',
4672         'CR': '201.192.0.0/12',
4673         'CU': '152.206.0.0/15',
4674         'CV': '165.90.96.0/19',
4675         'CW': '190.88.128.0/17',
4676         'CY': '31.153.0.0/16',
4677         'CZ': '88.100.0.0/14',
4678         'DE': '53.0.0.0/8',
4679         'DJ': '197.241.0.0/17',
4680         'DK': '87.48.0.0/12',
4681         'DM': '192.243.48.0/20',
4682         'DO': '152.166.0.0/15',
4683         'DZ': '41.96.0.0/12',
4684         'EC': '186.68.0.0/15',
4685         'EE': '90.190.0.0/15',
4686         'EG': '156.160.0.0/11',
4687         'ER': '196.200.96.0/20',
4688         'ES': '88.0.0.0/11',
4689         'ET': '196.188.0.0/14',
4690         'EU': '2.16.0.0/13',
4691         'FI': '91.152.0.0/13',
4692         'FJ': '144.120.0.0/16',
4693         'FK': '80.73.208.0/21',
4694         'FM': '119.252.112.0/20',
4695         'FO': '88.85.32.0/19',
4696         'FR': '90.0.0.0/9',
4697         'GA': '41.158.0.0/15',
4698         'GB': '25.0.0.0/8',
4699         'GD': '74.122.88.0/21',
4700         'GE': '31.146.0.0/16',
4701         'GF': '161.22.64.0/18',
4702         'GG': '62.68.160.0/19',
4703         'GH': '154.160.0.0/12',
4704         'GI': '95.164.0.0/16',
4705         'GL': '88.83.0.0/19',
4706         'GM': '160.182.0.0/15',
4707         'GN': '197.149.192.0/18',
4708         'GP': '104.250.0.0/19',
4709         'GQ': '105.235.224.0/20',
4710         'GR': '94.64.0.0/13',
4711         'GT': '168.234.0.0/16',
4712         'GU': '168.123.0.0/16',
4713         'GW': '197.214.80.0/20',
4714         'GY': '181.41.64.0/18',
4715         'HK': '113.252.0.0/14',
4716         'HN': '181.210.0.0/16',
4717         'HR': '93.136.0.0/13',
4718         'HT': '148.102.128.0/17',
4719         'HU': '84.0.0.0/14',
4720         'ID': '39.192.0.0/10',
4721         'IE': '87.32.0.0/12',
4722         'IL': '79.176.0.0/13',
4723         'IM': '5.62.80.0/20',
4724         'IN': '117.192.0.0/10',
4725         'IO': '203.83.48.0/21',
4726         'IQ': '37.236.0.0/14',
4727         'IR': '2.176.0.0/12',
4728         'IS': '82.221.0.0/16',
4729         'IT': '79.0.0.0/10',
4730         'JE': '87.244.64.0/18',
4731         'JM': '72.27.0.0/17',
4732         'JO': '176.29.0.0/16',
4733         'JP': '133.0.0.0/8',
4734         'KE': '105.48.0.0/12',
4735         'KG': '158.181.128.0/17',
4736         'KH': '36.37.128.0/17',
4737         'KI': '103.25.140.0/22',
4738         'KM': '197.255.224.0/20',
4739         'KN': '198.167.192.0/19',
4740         'KP': '175.45.176.0/22',
4741         'KR': '175.192.0.0/10',
4742         'KW': '37.36.0.0/14',
4743         'KY': '64.96.0.0/15',
4744         'KZ': '2.72.0.0/13',
4745         'LA': '115.84.64.0/18',
4746         'LB': '178.135.0.0/16',
4747         'LC': '24.92.144.0/20',
4748         'LI': '82.117.0.0/19',
4749         'LK': '112.134.0.0/15',
4750         'LR': '102.183.0.0/16',
4751         'LS': '129.232.0.0/17',
4752         'LT': '78.56.0.0/13',
4753         'LU': '188.42.0.0/16',
4754         'LV': '46.109.0.0/16',
4755         'LY': '41.252.0.0/14',
4756         'MA': '105.128.0.0/11',
4757         'MC': '88.209.64.0/18',
4758         'MD': '37.246.0.0/16',
4759         'ME': '178.175.0.0/17',
4760         'MF': '74.112.232.0/21',
4761         'MG': '154.126.0.0/17',
4762         'MH': '117.103.88.0/21',
4763         'MK': '77.28.0.0/15',
4764         'ML': '154.118.128.0/18',
4765         'MM': '37.111.0.0/17',
4766         'MN': '49.0.128.0/17',
4767         'MO': '60.246.0.0/16',
4768         'MP': '202.88.64.0/20',
4769         'MQ': '109.203.224.0/19',
4770         'MR': '41.188.64.0/18',
4771         'MS': '208.90.112.0/22',
4772         'MT': '46.11.0.0/16',
4773         'MU': '105.16.0.0/12',
4774         'MV': '27.114.128.0/18',
4775         'MW': '102.70.0.0/15',
4776         'MX': '187.192.0.0/11',
4777         'MY': '175.136.0.0/13',
4778         'MZ': '197.218.0.0/15',
4779         'NA': '41.182.0.0/16',
4780         'NC': '101.101.0.0/18',
4781         'NE': '197.214.0.0/18',
4782         'NF': '203.17.240.0/22',
4783         'NG': '105.112.0.0/12',
4784         'NI': '186.76.0.0/15',
4785         'NL': '145.96.0.0/11',
4786         'NO': '84.208.0.0/13',
4787         'NP': '36.252.0.0/15',
4788         'NR': '203.98.224.0/19',
4789         'NU': '49.156.48.0/22',
4790         'NZ': '49.224.0.0/14',
4791         'OM': '5.36.0.0/15',
4792         'PA': '186.72.0.0/15',
4793         'PE': '186.160.0.0/14',
4794         'PF': '123.50.64.0/18',
4795         'PG': '124.240.192.0/19',
4796         'PH': '49.144.0.0/13',
4797         'PK': '39.32.0.0/11',
4798         'PL': '83.0.0.0/11',
4799         'PM': '70.36.0.0/20',
4800         'PR': '66.50.0.0/16',
4801         'PS': '188.161.0.0/16',
4802         'PT': '85.240.0.0/13',
4803         'PW': '202.124.224.0/20',
4804         'PY': '181.120.0.0/14',
4805         'QA': '37.210.0.0/15',
4806         'RE': '102.35.0.0/16',
4807         'RO': '79.112.0.0/13',
4808         'RS': '93.86.0.0/15',
4809         'RU': '5.136.0.0/13',
4810         'RW': '41.186.0.0/16',
4811         'SA': '188.48.0.0/13',
4812         'SB': '202.1.160.0/19',
4813         'SC': '154.192.0.0/11',
4814         'SD': '102.120.0.0/13',
4815         'SE': '78.64.0.0/12',
4816         'SG': '8.128.0.0/10',
4817         'SI': '188.196.0.0/14',
4818         'SK': '78.98.0.0/15',
4819         'SL': '102.143.0.0/17',
4820         'SM': '89.186.32.0/19',
4821         'SN': '41.82.0.0/15',
4822         'SO': '154.115.192.0/18',
4823         'SR': '186.179.128.0/17',
4824         'SS': '105.235.208.0/21',
4825         'ST': '197.159.160.0/19',
4826         'SV': '168.243.0.0/16',
4827         'SX': '190.102.0.0/20',
4828         'SY': '5.0.0.0/16',
4829         'SZ': '41.84.224.0/19',
4830         'TC': '65.255.48.0/20',
4831         'TD': '154.68.128.0/19',
4832         'TG': '196.168.0.0/14',
4833         'TH': '171.96.0.0/13',
4834         'TJ': '85.9.128.0/18',
4835         'TK': '27.96.24.0/21',
4836         'TL': '180.189.160.0/20',
4837         'TM': '95.85.96.0/19',
4838         'TN': '197.0.0.0/11',
4839         'TO': '175.176.144.0/21',
4840         'TR': '78.160.0.0/11',
4841         'TT': '186.44.0.0/15',
4842         'TV': '202.2.96.0/19',
4843         'TW': '120.96.0.0/11',
4844         'TZ': '156.156.0.0/14',
4845         'UA': '37.52.0.0/14',
4846         'UG': '102.80.0.0/13',
4847         'US': '6.0.0.0/8',
4848         'UY': '167.56.0.0/13',
4849         'UZ': '84.54.64.0/18',
4850         'VA': '212.77.0.0/19',
4851         'VC': '207.191.240.0/21',
4852         'VE': '186.88.0.0/13',
4853         'VG': '66.81.192.0/20',
4854         'VI': '146.226.0.0/16',
4855         'VN': '14.160.0.0/11',
4856         'VU': '202.80.32.0/20',
4857         'WF': '117.20.32.0/21',
4858         'WS': '202.4.32.0/19',
4859         'YE': '134.35.0.0/16',
4860         'YT': '41.242.116.0/22',
4861         'ZA': '41.0.0.0/11',
4862         'ZM': '102.144.0.0/13',
4863         'ZW': '102.177.192.0/18',
4864     }
4865
4866     @classmethod
4867     def random_ipv4(cls, code_or_block):
4868         if len(code_or_block) == 2:
4869             block = cls._country_ip_map.get(code_or_block.upper())
4870             if not block:
4871                 return None
4872         else:
4873             block = code_or_block
4874         addr, preflen = block.split('/')
4875         addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4876         addr_max = addr_min | (0xffffffff >> int(preflen))
4877         return str(socket.inet_ntoa(
4878             struct.pack('!L', random.randint(addr_min, addr_max))))
4879
4880
4881 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4882     def __init__(self, proxies=None):
4883         # Set default handlers
4884         for type in ('http', 'https'):
4885             setattr(self, '%s_open' % type,
4886                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4887                         meth(r, proxy, type))
4888         urllib.request.ProxyHandler.__init__(self, proxies)
4889
4890     def proxy_open(self, req, proxy, type):
4891         req_proxy = req.headers.get('Ytdl-request-proxy')
4892         if req_proxy is not None:
4893             proxy = req_proxy
4894             del req.headers['Ytdl-request-proxy']
4895
4896         if proxy == '__noproxy__':
4897             return None  # No Proxy
4898         if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4899             req.add_header('Ytdl-socks-proxy', proxy)
4900             # yt-dlp's http/https handlers do wrapping the socket with socks
4901             return None
4902         return urllib.request.ProxyHandler.proxy_open(
4903             self, req, proxy, type)
4904
4905
4906 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4907 # released into Public Domain
4908 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4909
4910 def long_to_bytes(n, blocksize=0):
4911     """long_to_bytes(n:long, blocksize:int) : string
4912     Convert a long integer to a byte string.
4913
4914     If optional blocksize is given and greater than zero, pad the front of the
4915     byte string with binary zeros so that the length is a multiple of
4916     blocksize.
4917     """
4918     # after much testing, this algorithm was deemed to be the fastest
4919     s = b''
4920     n = int(n)
4921     while n > 0:
4922         s = struct.pack('>I', n & 0xffffffff) + s
4923         n = n >> 32
4924     # strip off leading zeros
4925     for i in range(len(s)):
4926         if s[i] != b'\000'[0]:
4927             break
4928     else:
4929         # only happens when n == 0
4930         s = b'\000'
4931         i = 0
4932     s = s[i:]
4933     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4934     # de-padding being done above, but sigh...
4935     if blocksize > 0 and len(s) % blocksize:
4936         s = (blocksize - len(s) % blocksize) * b'\000' + s
4937     return s
4938
4939
4940 def bytes_to_long(s):
4941     """bytes_to_long(string) : long
4942     Convert a byte string to a long integer.
4943
4944     This is (essentially) the inverse of long_to_bytes().
4945     """
4946     acc = 0
4947     length = len(s)
4948     if length % 4:
4949         extra = (4 - length % 4)
4950         s = b'\000' * extra + s
4951         length = length + extra
4952     for i in range(0, length, 4):
4953         acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4954     return acc
4955
4956
4957 def ohdave_rsa_encrypt(data, exponent, modulus):
4958     '''
4959     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4960
4961     Input:
4962         data: data to encrypt, bytes-like object
4963         exponent, modulus: parameter e and N of RSA algorithm, both integer
4964     Output: hex string of encrypted data
4965
4966     Limitation: supports one block encryption only
4967     '''
4968
4969     payload = int(binascii.hexlify(data[::-1]), 16)
4970     encrypted = pow(payload, exponent, modulus)
4971     return '%x' % encrypted
4972
4973
4974 def pkcs1pad(data, length):
4975     """
4976     Padding input data with PKCS#1 scheme
4977
4978     @param {int[]} data        input data
4979     @param {int}   length      target length
4980     @returns {int[]}           padded data
4981     """
4982     if len(data) > length - 11:
4983         raise ValueError('Input data too long for PKCS#1 padding')
4984
4985     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4986     return [0, 2] + pseudo_random + [0] + data
4987
4988
4989 def _base_n_table(n, table):
4990     if not table and not n:
4991         raise ValueError('Either table or n must be specified')
4992     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4993
4994     if n and n != len(table):
4995         raise ValueError(f'base {n} exceeds table length {len(table)}')
4996     return table
4997
4998
4999 def encode_base_n(num, n=None, table=None):
5000     """Convert given int to a base-n string"""
5001     table = _base_n_table(n, table)
5002     if not num:
5003         return table[0]
5004
5005     result, base = '', len(table)
5006     while num:
5007         result = table[num % base] + result
5008         num = num // base
5009     return result
5010
5011
5012 def decode_base_n(string, n=None, table=None):
5013     """Convert given base-n string to int"""
5014     table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5015     result, base = 0, len(table)
5016     for char in string:
5017         result = result * base + table[char]
5018     return result
5019
5020
5021 def decode_base(value, digits):
5022     deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5023                         f'in a future version. Use {__name__}.decode_base_n instead')
5024     return decode_base_n(value, table=digits)
5025
5026
5027 def decode_packed_codes(code):
5028     mobj = re.search(PACKED_CODES_RE, code)
5029     obfuscated_code, base, count, symbols = mobj.groups()
5030     base = int(base)
5031     count = int(count)
5032     symbols = symbols.split('|')
5033     symbol_table = {}
5034
5035     while count:
5036         count -= 1
5037         base_n_count = encode_base_n(count, base)
5038         symbol_table[base_n_count] = symbols[count] or base_n_count
5039
5040     return re.sub(
5041         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5042         obfuscated_code)
5043
5044
5045 def caesar(s, alphabet, shift):
5046     if shift == 0:
5047         return s
5048     l = len(alphabet)
5049     return ''.join(
5050         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5051         for c in s)
5052
5053
5054 def rot47(s):
5055     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5056
5057
5058 def parse_m3u8_attributes(attrib):
5059     info = {}
5060     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5061         if val.startswith('"'):
5062             val = val[1:-1]
5063         info[key] = val
5064     return info
5065
5066
5067 def urshift(val, n):
5068     return val >> n if val >= 0 else (val + 0x100000000) >> n
5069
5070
5071 # Based on png2str() written by @gdkchan and improved by @yokrysty
5072 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5073 def decode_png(png_data):
5074     # Reference: https://www.w3.org/TR/PNG/
5075     header = png_data[8:]
5076
5077     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5078         raise OSError('Not a valid PNG file.')
5079
5080     int_map = {1: '>B', 2: '>H', 4: '>I'}
5081     unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5082
5083     chunks = []
5084
5085     while header:
5086         length = unpack_integer(header[:4])
5087         header = header[4:]
5088
5089         chunk_type = header[:4]
5090         header = header[4:]
5091
5092         chunk_data = header[:length]
5093         header = header[length:]
5094
5095         header = header[4:]  # Skip CRC
5096
5097         chunks.append({
5098             'type': chunk_type,
5099             'length': length,
5100             'data': chunk_data
5101         })
5102
5103     ihdr = chunks[0]['data']
5104
5105     width = unpack_integer(ihdr[:4])
5106     height = unpack_integer(ihdr[4:8])
5107
5108     idat = b''
5109
5110     for chunk in chunks:
5111         if chunk['type'] == b'IDAT':
5112             idat += chunk['data']
5113
5114     if not idat:
5115         raise OSError('Unable to read PNG data.')
5116
5117     decompressed_data = bytearray(zlib.decompress(idat))
5118
5119     stride = width * 3
5120     pixels = []
5121
5122     def _get_pixel(idx):
5123         x = idx % stride
5124         y = idx // stride
5125         return pixels[y][x]
5126
5127     for y in range(height):
5128         basePos = y * (1 + stride)
5129         filter_type = decompressed_data[basePos]
5130
5131         current_row = []
5132
5133         pixels.append(current_row)
5134
5135         for x in range(stride):
5136             color = decompressed_data[1 + basePos + x]
5137             basex = y * stride + x
5138             left = 0
5139             up = 0
5140
5141             if x > 2:
5142                 left = _get_pixel(basex - 3)
5143             if y > 0:
5144                 up = _get_pixel(basex - stride)
5145
5146             if filter_type == 1:  # Sub
5147                 color = (color + left) & 0xff
5148             elif filter_type == 2:  # Up
5149                 color = (color + up) & 0xff
5150             elif filter_type == 3:  # Average
5151                 color = (color + ((left + up) >> 1)) & 0xff
5152             elif filter_type == 4:  # Paeth
5153                 a = left
5154                 b = up
5155                 c = 0
5156
5157                 if x > 2 and y > 0:
5158                     c = _get_pixel(basex - stride - 3)
5159
5160                 p = a + b - c
5161
5162                 pa = abs(p - a)
5163                 pb = abs(p - b)
5164                 pc = abs(p - c)
5165
5166                 if pa <= pb and pa <= pc:
5167                     color = (color + a) & 0xff
5168                 elif pb <= pc:
5169                     color = (color + b) & 0xff
5170                 else:
5171                     color = (color + c) & 0xff
5172
5173             current_row.append(color)
5174
5175     return width, height, pixels
5176
5177
5178 def write_xattr(path, key, value):
5179     # Windows: Write xattrs to NTFS Alternate Data Streams:
5180     # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5181     if compat_os_name == 'nt':
5182         assert ':' not in key
5183         assert os.path.exists(path)
5184
5185         try:
5186             with open(f'{path}:{key}', 'wb') as f:
5187                 f.write(value)
5188         except OSError as e:
5189             raise XAttrMetadataError(e.errno, e.strerror)
5190         return
5191
5192     # UNIX Method 1. Use xattrs/pyxattrs modules
5193
5194     setxattr = None
5195     if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5196         # Unicode arguments are not supported in pyxattr until version 0.5.0
5197         # See https://github.com/ytdl-org/youtube-dl/issues/5498
5198         if version_tuple(xattr.__version__) >= (0, 5, 0):
5199             setxattr = xattr.set
5200     elif xattr:
5201         setxattr = xattr.setxattr
5202
5203     if setxattr:
5204         try:
5205             setxattr(path, key, value)
5206         except OSError as e:
5207             raise XAttrMetadataError(e.errno, e.strerror)
5208         return
5209
5210     # UNIX Method 2. Use setfattr/xattr executables
5211     exe = ('setfattr' if check_executable('setfattr', ['--version'])
5212            else 'xattr' if check_executable('xattr', ['-h']) else None)
5213     if not exe:
5214         raise XAttrUnavailableError(
5215             'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5216             + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5217
5218     value = value.decode()
5219     try:
5220         _, stderr, returncode = Popen.run(
5221             [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5222             text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5223     except OSError as e:
5224         raise XAttrMetadataError(e.errno, e.strerror)
5225     if returncode:
5226         raise XAttrMetadataError(returncode, stderr)
5227
5228
5229 def random_birthday(year_field, month_field, day_field):
5230     start_date = datetime.date(1950, 1, 1)
5231     end_date = datetime.date(1995, 12, 31)
5232     offset = random.randint(0, (end_date - start_date).days)
5233     random_date = start_date + datetime.timedelta(offset)
5234     return {
5235         year_field: str(random_date.year),
5236         month_field: str(random_date.month),
5237         day_field: str(random_date.day),
5238     }
5239
5240
5241 # Templates for internet shortcut files, which are plain text files.
5242 DOT_URL_LINK_TEMPLATE = '''\
5243 [InternetShortcut]
5244 URL=%(url)s
5245 '''
5246
5247 DOT_WEBLOC_LINK_TEMPLATE = '''\
5248 <?xml version="1.0" encoding="UTF-8"?>
5249 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5250 <plist version="1.0">
5251 <dict>
5252 \t<key>URL</key>
5253 \t<string>%(url)s</string>
5254 </dict>
5255 </plist>
5256 '''
5257
5258 DOT_DESKTOP_LINK_TEMPLATE = '''\
5259 [Desktop Entry]
5260 Encoding=UTF-8
5261 Name=%(filename)s
5262 Type=Link
5263 URL=%(url)s
5264 Icon=text-html
5265 '''
5266
5267 LINK_TEMPLATES = {
5268     'url': DOT_URL_LINK_TEMPLATE,
5269     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5270     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5271 }
5272
5273
5274 def iri_to_uri(iri):
5275     """
5276     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5277
5278     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5279     """
5280
5281     iri_parts = urllib.parse.urlparse(iri)
5282
5283     if '[' in iri_parts.netloc:
5284         raise ValueError('IPv6 URIs are not, yet, supported.')
5285         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5286
5287     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5288
5289     net_location = ''
5290     if iri_parts.username:
5291         net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5292         if iri_parts.password is not None:
5293             net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5294         net_location += '@'
5295
5296     net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
5297     # The 'idna' encoding produces ASCII text.
5298     if iri_parts.port is not None and iri_parts.port != 80:
5299         net_location += ':' + str(iri_parts.port)
5300
5301     return urllib.parse.urlunparse(
5302         (iri_parts.scheme,
5303             net_location,
5304
5305             urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5306
5307             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5308             urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5309
5310             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5311             urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5312
5313             urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5314
5315     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5316
5317
5318 def to_high_limit_path(path):
5319     if sys.platform in ['win32', 'cygwin']:
5320         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5321         return '\\\\?\\' + os.path.abspath(path)
5322
5323     return path
5324
5325
5326 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5327     val = traverse_obj(obj, *variadic(field))
5328     if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5329         return default
5330     return template % func(val)
5331
5332
5333 def clean_podcast_url(url):
5334     return re.sub(r'''(?x)
5335         (?:
5336             (?:
5337                 chtbl\.com/track|
5338                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5339                 play\.podtrac\.com
5340             )/[^/]+|
5341             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5342             flex\.acast\.com|
5343             pd(?:
5344                 cn\.co| # https://podcorn.com/analytics-prefix/
5345                 st\.fm # https://podsights.com/docs/
5346             )/e
5347         )/''', '', url)
5348
5349
5350 _HEX_TABLE = '0123456789abcdef'
5351
5352
5353 def random_uuidv4():
5354     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5355
5356
5357 def make_dir(path, to_screen=None):
5358     try:
5359         dn = os.path.dirname(path)
5360         if dn and not os.path.exists(dn):
5361             os.makedirs(dn)
5362         return True
5363     except OSError as err:
5364         if callable(to_screen) is not None:
5365             to_screen('unable to create directory ' + error_to_compat_str(err))
5366         return False
5367
5368
5369 def get_executable_path():
5370     from .update import _get_variant_and_executable_path
5371
5372     return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5373
5374
5375 def load_plugins(name, suffix, namespace):
5376     classes = {}
5377     with contextlib.suppress(FileNotFoundError):
5378         plugins_spec = importlib.util.spec_from_file_location(
5379             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5380         plugins = importlib.util.module_from_spec(plugins_spec)
5381         sys.modules[plugins_spec.name] = plugins
5382         plugins_spec.loader.exec_module(plugins)
5383         for name in dir(plugins):
5384             if name in namespace:
5385                 continue
5386             if not name.endswith(suffix):
5387                 continue
5388             klass = getattr(plugins, name)
5389             classes[name] = namespace[name] = klass
5390     return classes
5391
5392
5393 def traverse_obj(
5394         obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5395         casesense=True, is_user_input=False, traverse_string=False):
5396     """
5397     Safely traverse nested `dict`s and `Sequence`s
5398
5399     >>> obj = [{}, {"key": "value"}]
5400     >>> traverse_obj(obj, (1, "key"))
5401     "value"
5402
5403     Each of the provided `paths` is tested and the first producing a valid result will be returned.
5404     The next path will also be tested if the path branched but no results could be found.
5405     Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5406     A value of None is treated as the absence of a value.
5407
5408     The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5409
5410     The keys in the path can be one of:
5411         - `None`:           Return the current object.
5412         - `str`/`int`:      Return `obj[key]`. For `re.Match, return `obj.group(key)`.
5413         - `slice`:          Branch out and return all values in `obj[key]`.
5414         - `Ellipsis`:       Branch out and return a list of all values.
5415         - `tuple`/`list`:   Branch out and return a list of all matching values.
5416                             Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5417         - `function`:       Branch out and return values filtered by the function.
5418                             Read as: `[value for key, value in obj if function(key, value)]`.
5419                             For `Sequence`s, `key` is the index of the value.
5420         - `dict`            Transform the current object and return a matching dict.
5421                             Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5422
5423         `tuple`, `list`, and `dict` all support nested paths and branches.
5424
5425     @params paths           Paths which to traverse by.
5426     @param default          Value to return if the paths do not match.
5427     @param expected_type    If a `type`, only accept final values of this type.
5428                             If any other callable, try to call the function on each result.
5429     @param get_all          If `False`, return the first matching result, otherwise all matching ones.
5430     @param casesense        If `False`, consider string dictionary keys as case insensitive.
5431
5432     The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5433
5434     @param is_user_input    Whether the keys are generated from user input.
5435                             If `True` strings get converted to `int`/`slice` if needed.
5436     @param traverse_string  Whether to traverse into objects as strings.
5437                             If `True`, any non-compatible object will first be
5438                             converted into a string and then traversed into.
5439
5440
5441     @returns                The result of the object traversal.
5442                             If successful, `get_all=True`, and the path branches at least once,
5443                             then a list of results is returned instead.
5444                             A list is always returned if the last path branches and no `default` is given.
5445     """
5446     is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5447     casefold = lambda k: k.casefold() if isinstance(k, str) else k
5448
5449     if isinstance(expected_type, type):
5450         type_test = lambda val: val if isinstance(val, expected_type) else None
5451     else:
5452         type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5453
5454     def apply_key(key, obj):
5455         if obj is None:
5456             return
5457
5458         elif key is None:
5459             yield obj
5460
5461         elif isinstance(key, (list, tuple)):
5462             for branch in key:
5463                 _, result = apply_path(obj, branch)
5464                 yield from result
5465
5466         elif key is ...:
5467             if isinstance(obj, collections.abc.Mapping):
5468                 yield from obj.values()
5469             elif is_sequence(obj):
5470                 yield from obj
5471             elif isinstance(obj, re.Match):
5472                 yield from obj.groups()
5473             elif traverse_string:
5474                 yield from str(obj)
5475
5476         elif callable(key):
5477             if is_sequence(obj):
5478                 iter_obj = enumerate(obj)
5479             elif isinstance(obj, collections.abc.Mapping):
5480                 iter_obj = obj.items()
5481             elif isinstance(obj, re.Match):
5482                 iter_obj = enumerate((obj.group(), *obj.groups()))
5483             elif traverse_string:
5484                 iter_obj = enumerate(str(obj))
5485             else:
5486                 return
5487             yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5488
5489         elif isinstance(key, dict):
5490             iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5491             yield {k: v if v is not None else default for k, v in iter_obj
5492                    if v is not None or default is not NO_DEFAULT}
5493
5494         elif isinstance(obj, collections.abc.Mapping):
5495             yield (obj.get(key) if casesense or (key in obj)
5496                    else next((v for k, v in obj.items() if casefold(k) == key), None))
5497
5498         elif isinstance(obj, re.Match):
5499             if isinstance(key, int) or casesense:
5500                 with contextlib.suppress(IndexError):
5501                     yield obj.group(key)
5502                     return
5503
5504             if not isinstance(key, str):
5505                 return
5506
5507             yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5508
5509         else:
5510             if is_user_input:
5511                 key = (int_or_none(key) if ':' not in key
5512                        else slice(*map(int_or_none, key.split(':'))))
5513
5514             if not isinstance(key, (int, slice)):
5515                 return
5516
5517             if not is_sequence(obj):
5518                 if not traverse_string:
5519                     return
5520                 obj = str(obj)
5521
5522             with contextlib.suppress(IndexError):
5523                 yield obj[key]
5524
5525     def apply_path(start_obj, path):
5526         objs = (start_obj,)
5527         has_branched = False
5528
5529         for key in variadic(path):
5530             if is_user_input and key == ':':
5531                 key = ...
5532
5533             if not casesense and isinstance(key, str):
5534                 key = key.casefold()
5535
5536             if key is ... or isinstance(key, (list, tuple)) or callable(key):
5537                 has_branched = True
5538
5539             key_func = functools.partial(apply_key, key)
5540             objs = itertools.chain.from_iterable(map(key_func, objs))
5541
5542         return has_branched, objs
5543
5544     def _traverse_obj(obj, path, use_list=True):
5545         has_branched, results = apply_path(obj, path)
5546         results = LazyList(x for x in map(type_test, results) if x is not None)
5547
5548         if get_all and has_branched:
5549             return results.exhaust() if results or use_list else None
5550
5551         return results[0] if results else None
5552
5553     for index, path in enumerate(paths, 1):
5554         use_list = default is NO_DEFAULT and index == len(paths)
5555         result = _traverse_obj(obj, path, use_list)
5556         if result is not None:
5557             return result
5558
5559     return None if default is NO_DEFAULT else default
5560
5561
5562 def traverse_dict(dictn, keys, casesense=True):
5563     deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5564                         f'in a future version. Use "{__name__}.traverse_obj" instead')
5565     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5566
5567
5568 def get_first(obj, keys, **kwargs):
5569     return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5570
5571
5572 def time_seconds(**kwargs):
5573     t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5574     return t.timestamp()
5575
5576
5577 # create a JSON Web Signature (jws) with HS256 algorithm
5578 # the resulting format is in JWS Compact Serialization
5579 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5580 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5581 def jwt_encode_hs256(payload_data, key, headers={}):
5582     header_data = {
5583         'alg': 'HS256',
5584         'typ': 'JWT',
5585     }
5586     if headers:
5587         header_data.update(headers)
5588     header_b64 = base64.b64encode(json.dumps(header_data).encode())
5589     payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5590     h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5591     signature_b64 = base64.b64encode(h.digest())
5592     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5593     return token
5594
5595
5596 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5597 def jwt_decode_hs256(jwt):
5598     header_b64, payload_b64, signature_b64 = jwt.split('.')
5599     # add trailing ='s that may have been stripped, superfluous ='s are ignored
5600     payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5601     return payload_data
5602
5603
5604 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5605
5606
5607 @functools.cache
5608 def supports_terminal_sequences(stream):
5609     if compat_os_name == 'nt':
5610         if not WINDOWS_VT_MODE:
5611             return False
5612     elif not os.getenv('TERM'):
5613         return False
5614     try:
5615         return stream.isatty()
5616     except BaseException:
5617         return False
5618
5619
5620 def windows_enable_vt_mode():
5621     """Ref: https://bugs.python.org/issue30075 """
5622     if get_windows_version() < (10, 0, 10586):
5623         return
5624
5625     import ctypes
5626     import ctypes.wintypes
5627     import msvcrt
5628
5629     ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5630
5631     dll = ctypes.WinDLL('kernel32', use_last_error=False)
5632     handle = os.open('CONOUT$', os.O_RDWR)
5633
5634     try:
5635         h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5636         dw_original_mode = ctypes.wintypes.DWORD()
5637         success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5638         if not success:
5639             raise Exception('GetConsoleMode failed')
5640
5641         success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5642             dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5643         if not success:
5644             raise Exception('SetConsoleMode failed')
5645     except Exception as e:
5646         write_string(f'WARNING: Cannot enable VT mode - {e}')
5647     else:
5648         global WINDOWS_VT_MODE
5649         WINDOWS_VT_MODE = True
5650         supports_terminal_sequences.cache_clear()
5651     finally:
5652         os.close(handle)
5653
5654
5655 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5656
5657
5658 def remove_terminal_sequences(string):
5659     return _terminal_sequences_re.sub('', string)
5660
5661
5662 def number_of_digits(number):
5663     return len('%d' % number)
5664
5665
5666 def join_nonempty(*values, delim='-', from_dict=None):
5667     if from_dict is not None:
5668         values = (traverse_obj(from_dict, variadic(v)) for v in values)
5669     return delim.join(map(str, filter(None, values)))
5670
5671
5672 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5673     """
5674     Find the largest format dimensions in terms of video width and, for each thumbnail:
5675     * Modify the URL: Match the width with the provided regex and replace with the former width
5676     * Update dimensions
5677
5678     This function is useful with video services that scale the provided thumbnails on demand
5679     """
5680     _keys = ('width', 'height')
5681     max_dimensions = max(
5682         (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5683         default=(0, 0))
5684     if not max_dimensions[0]:
5685         return thumbnails
5686     return [
5687         merge_dicts(
5688             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5689             dict(zip(_keys, max_dimensions)), thumbnail)
5690         for thumbnail in thumbnails
5691     ]
5692
5693
5694 def parse_http_range(range):
5695     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5696     if not range:
5697         return None, None, None
5698     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5699     if not crg:
5700         return None, None, None
5701     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5702
5703
5704 def read_stdin(what):
5705     eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5706     write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5707     return sys.stdin
5708
5709
5710 def determine_file_encoding(data):
5711     """
5712     Detect the text encoding used
5713     @returns (encoding, bytes to skip)
5714     """
5715
5716     # BOM marks are given priority over declarations
5717     for bom, enc in BOMS:
5718         if data.startswith(bom):
5719             return enc, len(bom)
5720
5721     # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5722     # We ignore the endianness to get a good enough match
5723     data = data.replace(b'\0', b'')
5724     mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5725     return mobj.group(1).decode() if mobj else None, 0
5726
5727
5728 class Config:
5729     own_args = None
5730     parsed_args = None
5731     filename = None
5732     __initialized = False
5733
5734     def __init__(self, parser, label=None):
5735         self.parser, self.label = parser, label
5736         self._loaded_paths, self.configs = set(), []
5737
5738     def init(self, args=None, filename=None):
5739         assert not self.__initialized
5740         self.own_args, self.filename = args, filename
5741         return self.load_configs()
5742
5743     def load_configs(self):
5744         directory = ''
5745         if self.filename:
5746             location = os.path.realpath(self.filename)
5747             directory = os.path.dirname(location)
5748             if location in self._loaded_paths:
5749                 return False
5750             self._loaded_paths.add(location)
5751
5752         self.__initialized = True
5753         opts, _ = self.parser.parse_known_args(self.own_args)
5754         self.parsed_args = self.own_args
5755         for location in opts.config_locations or []:
5756             if location == '-':
5757                 if location in self._loaded_paths:
5758                     continue
5759                 self._loaded_paths.add(location)
5760                 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5761                 continue
5762             location = os.path.join(directory, expand_path(location))
5763             if os.path.isdir(location):
5764                 location = os.path.join(location, 'yt-dlp.conf')
5765             if not os.path.exists(location):
5766                 self.parser.error(f'config location {location} does not exist')
5767             self.append_config(self.read_file(location), location)
5768         return True
5769
5770     def __str__(self):
5771         label = join_nonempty(
5772             self.label, 'config', f'"{self.filename}"' if self.filename else '',
5773             delim=' ')
5774         return join_nonempty(
5775             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5776             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5777             delim='\n')
5778
5779     @staticmethod
5780     def read_file(filename, default=[]):
5781         try:
5782             optionf = open(filename, 'rb')
5783         except OSError:
5784             return default  # silently skip if file is not present
5785         try:
5786             enc, skip = determine_file_encoding(optionf.read(512))
5787             optionf.seek(skip, io.SEEK_SET)
5788         except OSError:
5789             enc = None  # silently skip read errors
5790         try:
5791             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5792             contents = optionf.read().decode(enc or preferredencoding())
5793             res = shlex.split(contents, comments=True)
5794         except Exception as err:
5795             raise ValueError(f'Unable to parse "{filename}": {err}')
5796         finally:
5797             optionf.close()
5798         return res
5799
5800     @staticmethod
5801     def hide_login_info(opts):
5802         PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5803         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5804
5805         def _scrub_eq(o):
5806             m = eqre.match(o)
5807             if m:
5808                 return m.group('key') + '=PRIVATE'
5809             else:
5810                 return o
5811
5812         opts = list(map(_scrub_eq, opts))
5813         for idx, opt in enumerate(opts):
5814             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5815                 opts[idx + 1] = 'PRIVATE'
5816         return opts
5817
5818     def append_config(self, *args, label=None):
5819         config = type(self)(self.parser, label)
5820         config._loaded_paths = self._loaded_paths
5821         if config.init(*args):
5822             self.configs.append(config)
5823
5824     @property
5825     def all_args(self):
5826         for config in reversed(self.configs):
5827             yield from config.all_args
5828         yield from self.parsed_args or []
5829
5830     def parse_known_args(self, **kwargs):
5831         return self.parser.parse_known_args(self.all_args, **kwargs)
5832
5833     def parse_args(self):
5834         return self.parser.parse_args(self.all_args)
5835
5836
5837 class WebSocketsWrapper:
5838     """Wraps websockets module to use in non-async scopes"""
5839     pool = None
5840
5841     def __init__(self, url, headers=None, connect=True):
5842         self.loop = asyncio.new_event_loop()
5843         # XXX: "loop" is deprecated
5844         self.conn = websockets.connect(
5845             url, extra_headers=headers, ping_interval=None,
5846             close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5847         if connect:
5848             self.__enter__()
5849         atexit.register(self.__exit__, None, None, None)
5850
5851     def __enter__(self):
5852         if not self.pool:
5853             self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5854         return self
5855
5856     def send(self, *args):
5857         self.run_with_loop(self.pool.send(*args), self.loop)
5858
5859     def recv(self, *args):
5860         return self.run_with_loop(self.pool.recv(*args), self.loop)
5861
5862     def __exit__(self, type, value, traceback):
5863         try:
5864             return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5865         finally:
5866             self.loop.close()
5867             self._cancel_all_tasks(self.loop)
5868
5869     # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5870     # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5871     @staticmethod
5872     def run_with_loop(main, loop):
5873         if not asyncio.iscoroutine(main):
5874             raise ValueError(f'a coroutine was expected, got {main!r}')
5875
5876         try:
5877             return loop.run_until_complete(main)
5878         finally:
5879             loop.run_until_complete(loop.shutdown_asyncgens())
5880             if hasattr(loop, 'shutdown_default_executor'):
5881                 loop.run_until_complete(loop.shutdown_default_executor())
5882
5883     @staticmethod
5884     def _cancel_all_tasks(loop):
5885         to_cancel = asyncio.all_tasks(loop)
5886
5887         if not to_cancel:
5888             return
5889
5890         for task in to_cancel:
5891             task.cancel()
5892
5893         # XXX: "loop" is removed in python 3.10+
5894         loop.run_until_complete(
5895             asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5896
5897         for task in to_cancel:
5898             if task.cancelled():
5899                 continue
5900             if task.exception() is not None:
5901                 loop.call_exception_handler({
5902                     'message': 'unhandled exception during asyncio.run() shutdown',
5903                     'exception': task.exception(),
5904                     'task': task,
5905                 })
5906
5907
5908 def merge_headers(*dicts):
5909     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5910     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5911
5912
5913 def cached_method(f):
5914     """Cache a method"""
5915     signature = inspect.signature(f)
5916
5917     @functools.wraps(f)
5918     def wrapper(self, *args, **kwargs):
5919         bound_args = signature.bind(self, *args, **kwargs)
5920         bound_args.apply_defaults()
5921         key = tuple(bound_args.arguments.values())[1:]
5922
5923         cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5924         if key not in cache:
5925             cache[key] = f(self, *args, **kwargs)
5926         return cache[key]
5927     return wrapper
5928
5929
5930 class classproperty:
5931     """property access for class methods with optional caching"""
5932     def __new__(cls, func=None, *args, **kwargs):
5933         if not func:
5934             return functools.partial(cls, *args, **kwargs)
5935         return super().__new__(cls)
5936
5937     def __init__(self, func, *, cache=False):
5938         functools.update_wrapper(self, func)
5939         self.func = func
5940         self._cache = {} if cache else None
5941
5942     def __get__(self, _, cls):
5943         if self._cache is None:
5944             return self.func(cls)
5945         elif cls not in self._cache:
5946             self._cache[cls] = self.func(cls)
5947         return self._cache[cls]
5948
5949
5950 class Namespace(types.SimpleNamespace):
5951     """Immutable namespace"""
5952
5953     def __iter__(self):
5954         return iter(self.__dict__.values())
5955
5956     @property
5957     def items_(self):
5958         return self.__dict__.items()
5959
5960
5961 MEDIA_EXTENSIONS = Namespace(
5962     common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5963     video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5964     common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5965     audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5966     thumbnails=('jpg', 'png', 'webp'),
5967     storyboards=('mhtml', ),
5968     subtitles=('srt', 'vtt', 'ass', 'lrc'),
5969     manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5970 )
5971 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5972 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5973
5974 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5975
5976
5977 class RetryManager:
5978     """Usage:
5979         for retry in RetryManager(...):
5980             try:
5981                 ...
5982             except SomeException as err:
5983                 retry.error = err
5984                 continue
5985     """
5986     attempt, _error = 0, None
5987
5988     def __init__(self, _retries, _error_callback, **kwargs):
5989         self.retries = _retries or 0
5990         self.error_callback = functools.partial(_error_callback, **kwargs)
5991
5992     def _should_retry(self):
5993         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5994
5995     @property
5996     def error(self):
5997         if self._error is NO_DEFAULT:
5998             return None
5999         return self._error
6000
6001     @error.setter
6002     def error(self, value):
6003         self._error = value
6004
6005     def __iter__(self):
6006         while self._should_retry():
6007             self.error = NO_DEFAULT
6008             self.attempt += 1
6009             yield self
6010             if self.error:
6011                 self.error_callback(self.error, self.attempt, self.retries)
6012
6013     @staticmethod
6014     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6015         """Utility function for reporting retries"""
6016         if count > retries:
6017             if error:
6018                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6019             raise e
6020
6021         if not count:
6022             return warn(e)
6023         elif isinstance(e, ExtractorError):
6024             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
6025         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6026
6027         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6028         if delay:
6029             info(f'Sleeping {delay:.2f} seconds ...')
6030             time.sleep(delay)
6031
6032
6033 def make_archive_id(ie, video_id):
6034     ie_key = ie if isinstance(ie, str) else ie.ie_key()
6035     return f'{ie_key.lower()} {video_id}'
6036
6037
6038 def truncate_string(s, left, right=0):
6039     assert left > 3 and right >= 0
6040     if s is None or len(s) <= left + right:
6041         return s
6042     return f'{s[:left-3]}...{s[-right:] if right else ""}'
6043
6044
6045 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6046     assert 'all' in alias_dict, '"all" alias is required'
6047     requested = list(start or [])
6048     for val in options:
6049         discard = val.startswith('-')
6050         if discard:
6051             val = val[1:]
6052
6053         if val in alias_dict:
6054             val = alias_dict[val] if not discard else [
6055                 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6056             # NB: Do not allow regex in aliases for performance
6057             requested = orderedSet_from_options(val, alias_dict, start=requested)
6058             continue
6059
6060         current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6061                    else [val] if val in alias_dict['all'] else None)
6062         if current is None:
6063             raise ValueError(val)
6064
6065         if discard:
6066             for item in current:
6067                 while item in requested:
6068                     requested.remove(item)
6069         else:
6070             requested.extend(current)
6071
6072     return orderedSet(requested)
6073
6074
6075 class FormatSorter:
6076     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6077
6078     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6079                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6080                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
6081     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6082                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
6083                     'fps', 'fs_approx', 'source', 'id')
6084
6085     settings = {
6086         'vcodec': {'type': 'ordered', 'regex': True,
6087                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6088         'acodec': {'type': 'ordered', 'regex': True,
6089                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6090         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6091                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6092         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6093                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6094         'vext': {'type': 'ordered', 'field': 'video_ext',
6095                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6096                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6097         'aext': {'type': 'ordered', 'field': 'audio_ext',
6098                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6099                  'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6100         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6101         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6102                        'field': ('vcodec', 'acodec'),
6103                        'function': lambda it: int(any(v != 'none' for v in it))},
6104         'ie_pref': {'priority': True, 'type': 'extractor'},
6105         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6106         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6107         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6108         'quality': {'convert': 'float', 'default': -1},
6109         'filesize': {'convert': 'bytes'},
6110         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6111         'id': {'convert': 'string', 'field': 'format_id'},
6112         'height': {'convert': 'float_none'},
6113         'width': {'convert': 'float_none'},
6114         'fps': {'convert': 'float_none'},
6115         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6116         'tbr': {'convert': 'float_none'},
6117         'vbr': {'convert': 'float_none'},
6118         'abr': {'convert': 'float_none'},
6119         'asr': {'convert': 'float_none'},
6120         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6121
6122         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6123         'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6124         'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6125         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6126         'res': {'type': 'multiple', 'field': ('height', 'width'),
6127                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6128
6129         # Actual field names
6130         'format_id': {'type': 'alias', 'field': 'id'},
6131         'preference': {'type': 'alias', 'field': 'ie_pref'},
6132         'language_preference': {'type': 'alias', 'field': 'lang'},
6133         'source_preference': {'type': 'alias', 'field': 'source'},
6134         'protocol': {'type': 'alias', 'field': 'proto'},
6135         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6136         'audio_channels': {'type': 'alias', 'field': 'channels'},
6137
6138         # Deprecated
6139         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6140         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6141         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6142         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6143         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6144         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6145         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6146         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6147         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6148         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6149         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6150         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6151         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6152         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6153         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6154         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6155         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6156         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6157         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6158         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6159     }
6160
6161     def __init__(self, ydl, field_preference):
6162         self.ydl = ydl
6163         self._order = []
6164         self.evaluate_params(self.ydl.params, field_preference)
6165         if ydl.params.get('verbose'):
6166             self.print_verbose_info(self.ydl.write_debug)
6167
6168     def _get_field_setting(self, field, key):
6169         if field not in self.settings:
6170             if key in ('forced', 'priority'):
6171                 return False
6172             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6173                                         'deprecated and may be removed in a future version')
6174             self.settings[field] = {}
6175         propObj = self.settings[field]
6176         if key not in propObj:
6177             type = propObj.get('type')
6178             if key == 'field':
6179                 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6180             elif key == 'convert':
6181                 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6182             else:
6183                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6184             propObj[key] = default
6185         return propObj[key]
6186
6187     def _resolve_field_value(self, field, value, convertNone=False):
6188         if value is None:
6189             if not convertNone:
6190                 return None
6191         else:
6192             value = value.lower()
6193         conversion = self._get_field_setting(field, 'convert')
6194         if conversion == 'ignore':
6195             return None
6196         if conversion == 'string':
6197             return value
6198         elif conversion == 'float_none':
6199             return float_or_none(value)
6200         elif conversion == 'bytes':
6201             return parse_bytes(value)
6202         elif conversion == 'order':
6203             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6204             use_regex = self._get_field_setting(field, 'regex')
6205             list_length = len(order_list)
6206             empty_pos = order_list.index('') if '' in order_list else list_length + 1
6207             if use_regex and value is not None:
6208                 for i, regex in enumerate(order_list):
6209                     if regex and re.match(regex, value):
6210                         return list_length - i
6211                 return list_length - empty_pos  # not in list
6212             else:  # not regex or  value = None
6213                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6214         else:
6215             if value.isnumeric():
6216                 return float(value)
6217             else:
6218                 self.settings[field]['convert'] = 'string'
6219                 return value
6220
6221     def evaluate_params(self, params, sort_extractor):
6222         self._use_free_order = params.get('prefer_free_formats', False)
6223         self._sort_user = params.get('format_sort', [])
6224         self._sort_extractor = sort_extractor
6225
6226         def add_item(field, reverse, closest, limit_text):
6227             field = field.lower()
6228             if field in self._order:
6229                 return
6230             self._order.append(field)
6231             limit = self._resolve_field_value(field, limit_text)
6232             data = {
6233                 'reverse': reverse,
6234                 'closest': False if limit is None else closest,
6235                 'limit_text': limit_text,
6236                 'limit': limit}
6237             if field in self.settings:
6238                 self.settings[field].update(data)
6239             else:
6240                 self.settings[field] = data
6241
6242         sort_list = (
6243             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6244             + (tuple() if params.get('format_sort_force', False)
6245                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6246             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6247
6248         for item in sort_list:
6249             match = re.match(self.regex, item)
6250             if match is None:
6251                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6252             field = match.group('field')
6253             if field is None:
6254                 continue
6255             if self._get_field_setting(field, 'type') == 'alias':
6256                 alias, field = field, self._get_field_setting(field, 'field')
6257                 if self._get_field_setting(alias, 'deprecated'):
6258                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6259                                                 f'be removed in a future version. Please use {field} instead')
6260             reverse = match.group('reverse') is not None
6261             closest = match.group('separator') == '~'
6262             limit_text = match.group('limit')
6263
6264             has_limit = limit_text is not None
6265             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6266             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6267
6268             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6269             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6270             limit_count = len(limits)
6271             for (i, f) in enumerate(fields):
6272                 add_item(f, reverse, closest,
6273                          limits[i] if i < limit_count
6274                          else limits[0] if has_limit and not has_multiple_limits
6275                          else None)
6276
6277     def print_verbose_info(self, write_debug):
6278         if self._sort_user:
6279             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6280         if self._sort_extractor:
6281             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6282         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6283             '+' if self._get_field_setting(field, 'reverse') else '', field,
6284             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6285                           self._get_field_setting(field, 'limit_text'),
6286                           self._get_field_setting(field, 'limit'))
6287             if self._get_field_setting(field, 'limit_text') is not None else '')
6288             for field in self._order if self._get_field_setting(field, 'visible')]))
6289
6290     def _calculate_field_preference_from_value(self, format, field, type, value):
6291         reverse = self._get_field_setting(field, 'reverse')
6292         closest = self._get_field_setting(field, 'closest')
6293         limit = self._get_field_setting(field, 'limit')
6294
6295         if type == 'extractor':
6296             maximum = self._get_field_setting(field, 'max')
6297             if value is None or (maximum is not None and value >= maximum):
6298                 value = -1
6299         elif type == 'boolean':
6300             in_list = self._get_field_setting(field, 'in_list')
6301             not_in_list = self._get_field_setting(field, 'not_in_list')
6302             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6303         elif type == 'ordered':
6304             value = self._resolve_field_value(field, value, True)
6305
6306         # try to convert to number
6307         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6308         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6309         if is_num:
6310             value = val_num
6311
6312         return ((-10, 0) if value is None
6313                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
6314                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6315                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6316                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6317                 else (-1, value, 0))
6318
6319     def _calculate_field_preference(self, format, field):
6320         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
6321         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6322         if type == 'multiple':
6323             type = 'field'  # Only 'field' is allowed in multiple for now
6324             actual_fields = self._get_field_setting(field, 'field')
6325
6326             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6327         else:
6328             value = get_value(field)
6329         return self._calculate_field_preference_from_value(format, field, type, value)
6330
6331     def calculate_preference(self, format):
6332         # Determine missing protocol
6333         if not format.get('protocol'):
6334             format['protocol'] = determine_protocol(format)
6335
6336         # Determine missing ext
6337         if not format.get('ext') and 'url' in format:
6338             format['ext'] = determine_ext(format['url'])
6339         if format.get('vcodec') == 'none':
6340             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6341             format['video_ext'] = 'none'
6342         else:
6343             format['video_ext'] = format['ext']
6344             format['audio_ext'] = 'none'
6345         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
6346         #    format['preference'] = -1000
6347
6348         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6349             # HEVC-over-FLV is out-of-spec by FLV's original spec
6350             # ref. https://trac.ffmpeg.org/ticket/6389
6351             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6352             format['preference'] = -100
6353
6354         # Determine missing bitrates
6355         if format.get('tbr') is None:
6356             if format.get('vbr') is not None and format.get('abr') is not None:
6357                 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6358         else:
6359             if format.get('vcodec') != 'none' and format.get('vbr') is None:
6360                 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6361             if format.get('acodec') != 'none' and format.get('abr') is None:
6362                 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6363
6364         return tuple(self._calculate_field_preference(format, field) for field in self._order)
6365
6366
6367 # Deprecated
6368 has_certifi = bool(certifi)
6369 has_websockets = bool(websockets)