import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
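
# Usage sketch (illustrative; 'info.json' is a hypothetical path):
#   write_json_file({'id': 'abc123'}, 'info.json')
# The object is serialized into a sibling '*.tmp' file first and then
# os.rename()d over the destination, so readers never see a partial file.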


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
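
# Usage sketch (the MRSS namespace URI is just an illustrative mapping):
#   xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
# expands each prefixed component, yielding
#   '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'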


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
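
# Usage sketch:
#   get_element_text_and_html_by_tag('a', '<p><a href="#">link</a></p>')
# returns the pair ('link', '<a href="#">link</a>'); nested tags of the
# same name are balanced via HTMLBreakOnClosingTagParser above.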


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
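
# Usage sketch:
#   extract_attributes('<a href="/watch?v=1" class="btn" data-id=42>')
# returns {'href': '/watch?v=1', 'class': 'btn', 'data-id': '42'}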


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
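
# Usage sketch:
#   clean_html('<p>Hello<br/>world &amp; friends</p>')
# returns 'Hello\nworld & friends' (tags stripped, <br> turned into a
# newline, entities decoded).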


class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
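
# Usage sketch: `ignore_extra` tolerates trailing garbage after the JSON
# value, and `close_objects` bounds how many unterminated objects/arrays
# the decoder may try to close before giving up:
#   LenientJSONDecoder(ignore_extra=True).decode('{"a": 1}); // js suffix')
# returns {'a': 1}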


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
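
# Usage sketch:
#   sanitize_filename('A/B: C', restricted=True)
# returns 'A_B_-_C', while the default (unrestricted) mode substitutes
# look-alike full-width characters instead of dropping information.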


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
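
# Usage sketch:
#   sanitize_url('//cdn.example.com/v.mp4')  == 'http://cdn.example.com/v.mp4'
#   sanitize_url('httpss://example.com/v')   == 'https://example.com/v'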


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
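
# Usage sketch ('user:pass' base64-encodes to 'dXNlcjpwYXNz'):
#   extract_basic_auth('http://user:pass@example.com/feed')
# returns ('http://example.com/feed', 'Basic dXNlcjpwYXNz')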


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
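
# Usage sketch:
#   orderedSet([1, 2, 1, 3, 2])    == [1, 2, 3]
#   orderedSet('abca', lazy=True)  # returns a generator, evaluated on demand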


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
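
# Usage sketch (named, decimal and hex entity forms are all handled):
#   unescapeHTML('&amp;&eacute;&#x2F;') == '&é/'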


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
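
# Usage sketch for the two helpers above:
#   timetuple_from_msec(90061500)  == Time(hours=25, minutes=1, seconds=1, milliseconds=500)
#   formatSeconds(3661)            == '1:01:01'
#   formatSeconds(3661, msec=True) == '1:01:01.000'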


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to work around _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is
        # not always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
        # To work around the aforementioned issue, we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen().
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # According to RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
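
# Usage sketch (a hypothetical local SOCKS5 proxy):
#   conn_class = make_socks_conn_class(
#       http.client.HTTPSConnection, 'socks5://127.0.0.1:1080')
# The returned subclass tunnels its TCP connection through the proxy
# before the TLS handshake is performed.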


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
1685 # cookies on our own.
1686 # Session cookies may be important for cookies-based authentication,
1687 # e.g. usually, when user does not check 'Remember me' check box while
1688 # logging in on a site, some important cookies are stored as session
1689 # cookies so that not recognizing them will result in failed login.
1690 # 1. https://bugs.python.org/issue17164
1691 for cookie in self:
1692 # Treat `expires=0` cookies as session cookies
1693 if cookie.expires == 0:
1694 cookie.expires = None
1695 cookie.discard = True
1696
1697
1698 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1699 def __init__(self, cookiejar=None):
1700 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1701
1702 def http_response(self, request, response):
1703 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1704
1705 https_request = urllib.request.HTTPCookieProcessor.http_request
1706 https_response = http_response
1707
1708
1709 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1710 """YoutubeDL redirect handler
1711
1712 The code is based on HTTPRedirectHandler implementation from CPython [1].
1713
1714 This redirect handler solves two issues:
1715 - ensures redirect URL is always unicode under python 2
1716 - introduces support for experimental HTTP response status code
1717 308 Permanent Redirect [2] used by some sites [3]
1718
1719 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1720 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1721 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1722 """
1723
1724 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1725
1726 def redirect_request(self, req, fp, code, msg, headers, newurl):
1727 """Return a Request or None in response to a redirect.
1728
1729 This is called by the http_error_30x methods when a
1730 redirection response is received. If a redirection should
1731 take place, return a new Request to allow http_error_30x to
1732 perform the redirect. Otherwise, raise HTTPError if no-one
1733 else should try to handle this url. Return None if you can't
1734 but another Handler might.
1735 """
1736 m = req.get_method()
1737 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1738 or code in (301, 302, 303) and m == "POST")):
1739 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1740 # Strictly (according to RFC 2616), 301 or 302 in response to
1741 # a POST MUST NOT cause a redirection without confirmation
1742 # from the user (of urllib.request, in this case). In practice,
1743 # essentially all clients do redirect in this case, so we do
1744 # the same.
1745
1746 # Be lenient with URIs containing a space. This is mainly
1747 # redundant with the more complete encoding done in http_error_302(),
1748 # but it is kept for compatibility with other callers.
1749 newurl = newurl.replace(' ', '%20')
1750
1751 CONTENT_HEADERS = ("content-length", "content-type")
1752 # Strip Content-* headers, as they do not apply to the redirected request
1753 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1754
1755 # A 303 must either use GET or HEAD for subsequent request
1756 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1757 if code == 303 and m != 'HEAD':
1758 m = 'GET'
1759 # 301 and 302 redirects are commonly turned into a GET from a POST
1760 # for subsequent requests by browsers, so we'll do the same.
1761 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1762 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1763 if code in (301, 302) and m == 'POST':
1764 m = 'GET'
1765
1766 return urllib.request.Request(
1767 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1768 unverifiable=True, method=m)
1769
1770
1771 def extract_timezone(date_str):
1772 m = re.search(
1773 r'''(?x)
1774 ^.{8,}? # >=8 char non-TZ prefix, if present
1775 (?P<tz>Z| # just the UTC Z, or
1776 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1777 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1778 [ ]? # optional space
1779 (?P<sign>\+|-) # +/-
1780 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1781 $)
1782 ''', date_str)
1783 if not m:
1784 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1785 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1786 if timezone is not None:
1787 date_str = date_str[:-len(m.group('tz'))]
1788 timezone = datetime.timedelta(hours=timezone or 0)
1789 else:
1790 date_str = date_str[:-len(m.group('tz'))]
1791 if not m.group('sign'):
1792 timezone = datetime.timedelta()
1793 else:
1794 sign = 1 if m.group('sign') == '+' else -1
1795 timezone = datetime.timedelta(
1796 hours=sign * int(m.group('hours')),
1797 minutes=sign * int(m.group('minutes')))
1798 return timezone, date_str
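# Illustrative sketch (doctest-style; not from the original source): expected
# behaviour of extract_timezone given the regexes above:
#   extract_timezone('2022-01-01T12:00:00+05:30')
#   # -> (datetime.timedelta(hours=5, minutes=30), '2022-01-01T12:00:00')
#   extract_timezone('Sat, 09 Apr 2022')
#   # -> (datetime.timedelta(0), 'Sat, 09 Apr 2022')  # no TZ; UTC is assumed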
1799
1800
1801 def parse_iso8601(date_str, delimiter='T', timezone=None):
1802 """ Return a UNIX timestamp from the given date """
1803
1804 if date_str is None:
1805 return None
1806
1807 date_str = re.sub(r'\.[0-9]+', '', date_str)
1808
1809 if timezone is None:
1810 timezone, date_str = extract_timezone(date_str)
1811
1812 with contextlib.suppress(ValueError):
1813 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1814 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1815 return calendar.timegm(dt.timetuple())
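# Illustrative sketch: with the timezone extracted above, both of these
# should yield the same UNIX timestamp:
#   parse_iso8601('2014-01-01T00:00:00+00:00')  # -> 1388534400
#   parse_iso8601('2014-01-01T05:30:00+05:30')  # -> 1388534400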
1816
1817
1818 def date_formats(day_first=True):
1819 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1820
1821
1822 def unified_strdate(date_str, day_first=True):
1823 """Return a string with the date in the format YYYYMMDD"""
1824
1825 if date_str is None:
1826 return None
1827 upload_date = None
1828 # Replace commas
1829 date_str = date_str.replace(',', ' ')
1830 # Remove AM/PM + timezone
1831 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1832 _, date_str = extract_timezone(date_str)
1833
1834 for expression in date_formats(day_first):
1835 with contextlib.suppress(ValueError):
1836 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1837 if upload_date is None:
1838 timetuple = email.utils.parsedate_tz(date_str)
1839 if timetuple:
1840 with contextlib.suppress(ValueError):
1841 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1842 if upload_date is not None:
1843 return str(upload_date)
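# Illustrative sketch; results assume the DATE_FORMATS_* tables defined
# elsewhere in this module:
#   unified_strdate('December 21, 2012')  # -> '20121221'
#   unified_strdate('8/7/2009', day_first=False)  # -> '20090807'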
1844
1845
1846 def unified_timestamp(date_str, day_first=True):
1847 if date_str is None:
1848 return None
1849
1850 date_str = re.sub(r'\s+', ' ', re.sub(
1851 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1852
1853 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1854 timezone, date_str = extract_timezone(date_str)
1855
1856 # Remove AM/PM + timezone
1857 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1858
1859 # Remove unrecognized timezones from ISO 8601 alike timestamps
1860 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1861 if m:
1862 date_str = date_str[:-len(m.group('tz'))]
1863
1864 # Python only supports microseconds, so remove nanoseconds
1865 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1866 if m:
1867 date_str = m.group(1)
1868
1869 for expression in date_formats(day_first):
1870 with contextlib.suppress(ValueError):
1871 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1872 return calendar.timegm(dt.timetuple())
1873
1874 timetuple = email.utils.parsedate_tz(date_str)
1875 if timetuple:
1876 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
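# Illustrative sketch: unified_timestamp accepts similar inputs but returns a
# UNIX timestamp, applying the PM shift and the extracted timezone:
#   unified_timestamp('December 15, 2017')  # -> 1513296000
#   unified_timestamp('1968-12-10')  # -> -33436800 (pre-epoch dates work too)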
1877
1878
1879 def determine_ext(url, default_ext='unknown_video'):
1880 if url is None or '.' not in url:
1881 return default_ext
1882 guess = url.partition('?')[0].rpartition('.')[2]
1883 if re.match(r'^[A-Za-z0-9]+$', guess):
1884 return guess
1885 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1886 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1887 return guess.rstrip('/')
1888 else:
1889 return default_ext
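# Illustrative sketch; the second form relies on KNOWN_EXTENSIONS (defined
# elsewhere in this module) containing 'mp4':
#   determine_ext('http://example.com/video.mp4?download=1')   # -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  # -> 'mp4'
#   determine_ext('http://example.com/page')  # -> 'unknown_video'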
1890
1891
1892 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1893 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1894
1895
1896 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1897 R"""
1898 Return a datetime object from a string.
1899 Supported format:
1900 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1901
1902 @param format strftime format of DATE
1903 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1904 auto: round to the unit provided in date_str (if applicable).
1905 """
1906 auto_precision = False
1907 if precision == 'auto':
1908 auto_precision = True
1909 precision = 'microsecond'
1910 today = datetime_round(datetime.datetime.utcnow(), precision)
1911 if date_str in ('now', 'today'):
1912 return today
1913 if date_str == 'yesterday':
1914 return today - datetime.timedelta(days=1)
1915 match = re.match(
1916 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1917 date_str)
1918 if match is not None:
1919 start_time = datetime_from_str(match.group('start'), precision, format)
1920 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1921 unit = match.group('unit')
1922 if unit == 'month' or unit == 'year':
1923 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1924 unit = 'day'
1925 else:
1926 if unit == 'week':
1927 unit = 'day'
1928 time *= 7
1929 delta = datetime.timedelta(**{unit + 's': time})
1930 new_date = start_time + delta
1931 if auto_precision:
1932 return datetime_round(new_date, unit)
1933 return new_date
1934
1935 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1936
1937
1938 def date_from_str(date_str, format='%Y%m%d', strict=False):
1939 R"""
1940 Return a date object from a string using datetime_from_str
1941
1942 @param strict Restrict allowed patterns to "YYYYMMDD" and
1943 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1944 """
1945 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1946 raise ValueError(f'Invalid date format "{date_str}"')
1947 return datetime_from_str(date_str, precision='microsecond', format=format).date()
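# Illustrative sketch of the relative-date syntax accepted above:
#   date_from_str('now-1week') == date_from_str('now-7days')  # -> True
#   date_from_str('20221001')  # -> datetime.date(2022, 10, 1)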
1948
1949
1950 def datetime_add_months(dt, months):
1951 """Increment/Decrement a datetime object by months."""
1952 month = dt.month + months - 1
1953 year = dt.year + month // 12
1954 month = month % 12 + 1
1955 day = min(dt.day, calendar.monthrange(year, month)[1])
1956 return dt.replace(year, month, day)
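# Illustrative sketch: the day is clamped to the length of the target month:
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   # -> datetime.datetime(2020, 2, 29, 0, 0)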
1957
1958
1959 def datetime_round(dt, precision='day'):
1960 """
1961 Round a datetime object's time to a specific precision
1962 """
1963 if precision == 'microsecond':
1964 return dt
1965
1966 unit_seconds = {
1967 'day': 86400,
1968 'hour': 3600,
1969 'minute': 60,
1970 'second': 1,
1971 }
1972 roundto = lambda x, n: ((x + n / 2) // n) * n
1973 timestamp = calendar.timegm(dt.timetuple())
1974 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1975
1976
1977 def hyphenate_date(date_str):
1978 """
1979 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1980 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1981 if match is not None:
1982 return '-'.join(match.groups())
1983 else:
1984 return date_str
1985
1986
1987 class DateRange:
1988 """Represents a time interval between two dates"""
1989
1990 def __init__(self, start=None, end=None):
1991 """start and end must be strings in the format accepted by date"""
1992 if start is not None:
1993 self.start = date_from_str(start, strict=True)
1994 else:
1995 self.start = datetime.datetime.min.date()
1996 if end is not None:
1997 self.end = date_from_str(end, strict=True)
1998 else:
1999 self.end = datetime.datetime.max.date()
2000 if self.start > self.end:
2001 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
2002
2003 @classmethod
2004 def day(cls, day):
2005 """Returns a range that only contains the given day"""
2006 return cls(day, day)
2007
2008 def __contains__(self, date):
2009 """Check if the date is in the range"""
2010 if not isinstance(date, datetime.date):
2011 date = date_from_str(date)
2012 return self.start <= date <= self.end
2013
2014 def __str__(self):
2015 return f'{self.start.isoformat()} - {self.end.isoformat()}'
2016
2017 def __eq__(self, other):
2018 return (isinstance(other, DateRange)
2019 and self.start == other.start and self.end == other.end)
2020
2021
2022 def platform_name():
2023 """ Returns the platform name as a str """
2024 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2025 return platform.platform()
2026
2027
2028 @functools.cache
2029 def system_identifier():
2030 python_implementation = platform.python_implementation()
2031 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2032 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2033 libc_ver = []
2034 with contextlib.suppress(OSError): # We may not have access to the executable
2035 libc_ver = platform.libc_ver()
2036
2037 return 'Python %s (%s %s %s) - %s (%s%s)' % (
2038 platform.python_version(),
2039 python_implementation,
2040 platform.machine(),
2041 platform.architecture()[0],
2042 platform.platform(),
2043 ssl.OPENSSL_VERSION,
2044 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2045 )
2046
2047
2048 @functools.cache
2049 def get_windows_version():
2050 ''' Get the Windows version. Returns () if not running on Windows '''
2051 if compat_os_name == 'nt':
2052 return version_tuple(platform.win32_ver()[1])
2053 else:
2054 return ()
2055
2056
2057 def write_string(s, out=None, encoding=None):
2058 assert isinstance(s, str)
2059 out = out or sys.stderr
2060 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
2061 if not out:
2062 return
2063
2064 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2065 s = re.sub(r'([\r\n]+)', r' \1', s)
2066
2067 enc, buffer = None, out
2068 if 'b' in getattr(out, 'mode', ''):
2069 enc = encoding or preferredencoding()
2070 elif hasattr(out, 'buffer'):
2071 buffer = out.buffer
2072 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2073
2074 buffer.write(s.encode(enc, 'ignore') if enc else s)
2075 out.flush()
2076
2077
2078 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2079 from . import _IN_CLI
2080 if _IN_CLI:
2081 if msg in deprecation_warning._cache:
2082 return
2083 deprecation_warning._cache.add(msg)
2084 if printer:
2085 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2086 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2087 else:
2088 import warnings
2089 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2090
2091
2092 deprecation_warning._cache = set()
2093
2094
2095 def bytes_to_intlist(bs):
2096 if not bs:
2097 return []
2098 if isinstance(bs[0], int): # indexing bytes yields ints on Python 3
2099 return list(bs)
2100 else:
2101 return [ord(c) for c in bs]
2102
2103
2104 def intlist_to_bytes(xs):
2105 if not xs:
2106 return b''
2107 return struct.pack('%dB' % len(xs), *xs)
2108
2109
2110 class LockingUnsupportedError(OSError):
2111 msg = 'File locking is not supported'
2112
2113 def __init__(self):
2114 super().__init__(self.msg)
2115
2116
2117 # Cross-platform file locking
2118 if sys.platform == 'win32':
2119 import ctypes
2120 import ctypes.wintypes
2121 import msvcrt
2122
2123 class OVERLAPPED(ctypes.Structure):
2124 _fields_ = [
2125 ('Internal', ctypes.wintypes.LPVOID),
2126 ('InternalHigh', ctypes.wintypes.LPVOID),
2127 ('Offset', ctypes.wintypes.DWORD),
2128 ('OffsetHigh', ctypes.wintypes.DWORD),
2129 ('hEvent', ctypes.wintypes.HANDLE),
2130 ]
2131
2132 kernel32 = ctypes.WinDLL('kernel32')
2133 LockFileEx = kernel32.LockFileEx
2134 LockFileEx.argtypes = [
2135 ctypes.wintypes.HANDLE, # hFile
2136 ctypes.wintypes.DWORD, # dwFlags
2137 ctypes.wintypes.DWORD, # dwReserved
2138 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2139 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2140 ctypes.POINTER(OVERLAPPED) # Overlapped
2141 ]
2142 LockFileEx.restype = ctypes.wintypes.BOOL
2143 UnlockFileEx = kernel32.UnlockFileEx
2144 UnlockFileEx.argtypes = [
2145 ctypes.wintypes.HANDLE, # hFile
2146 ctypes.wintypes.DWORD, # dwReserved
2147 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2148 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2149 ctypes.POINTER(OVERLAPPED) # Overlapped
2150 ]
2151 UnlockFileEx.restype = ctypes.wintypes.BOOL
2152 whole_low = 0xffffffff
2153 whole_high = 0x7fffffff
2154
2155 def _lock_file(f, exclusive, block):
2156 overlapped = OVERLAPPED()
2157 overlapped.Offset = 0
2158 overlapped.OffsetHigh = 0
2159 overlapped.hEvent = 0
2160 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2161
2162 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2163 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2164 0, whole_low, whole_high, f._lock_file_overlapped_p):
2165 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2166 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2167
2168 def _unlock_file(f):
2169 assert f._lock_file_overlapped_p
2170 handle = msvcrt.get_osfhandle(f.fileno())
2171 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2172 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2173
2174 else:
2175 try:
2176 import fcntl
2177
2178 def _lock_file(f, exclusive, block):
2179 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2180 if not block:
2181 flags |= fcntl.LOCK_NB
2182 try:
2183 fcntl.flock(f, flags)
2184 except BlockingIOError:
2185 raise
2186 except OSError: # AOSP does not have flock()
2187 fcntl.lockf(f, flags)
2188
2189 def _unlock_file(f):
2190 try:
2191 fcntl.flock(f, fcntl.LOCK_UN)
2192 except OSError:
2193 fcntl.lockf(f, fcntl.LOCK_UN)
2194
2195 except ImportError:
2196
2197 def _lock_file(f, exclusive, block):
2198 raise LockingUnsupportedError()
2199
2200 def _unlock_file(f):
2201 raise LockingUnsupportedError()
2202
2203
2204 class locked_file:
2205 locked = False
2206
2207 def __init__(self, filename, mode, block=True, encoding=None):
2208 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2209 raise NotImplementedError(mode)
2210 self.mode, self.block = mode, block
2211
2212 writable = any(f in mode for f in 'wax+')
2213 readable = any(f in mode for f in 'r+')
2214 flags = functools.reduce(operator.ior, (
2215 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2216 getattr(os, 'O_BINARY', 0), # Windows only
2217 getattr(os, 'O_NOINHERIT', 0), # Windows only
2218 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2219 os.O_APPEND if 'a' in mode else 0,
2220 os.O_EXCL if 'x' in mode else 0,
2221 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2222 ))
2223
2224 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2225
2226 def __enter__(self):
2227 exclusive = 'r' not in self.mode
2228 try:
2229 _lock_file(self.f, exclusive, self.block)
2230 self.locked = True
2231 except OSError:
2232 self.f.close()
2233 raise
2234 if 'w' in self.mode:
2235 try:
2236 self.f.truncate()
2237 except OSError as e:
2238 if e.errno not in (
2239 errno.ESPIPE, # Illegal seek - expected for FIFO
2240 errno.EINVAL, # Invalid argument - expected for /dev/null
2241 ):
2242 raise
2243 return self
2244
2245 def unlock(self):
2246 if not self.locked:
2247 return
2248 try:
2249 _unlock_file(self.f)
2250 finally:
2251 self.locked = False
2252
2253 def __exit__(self, *_):
2254 try:
2255 self.unlock()
2256 finally:
2257 self.f.close()
2258
2259 open = __enter__
2260 close = __exit__
2261
2262 def __getattr__(self, attr):
2263 return getattr(self.f, attr)
2264
2265 def __iter__(self):
2266 return iter(self.f)
2267
2268
2269 @functools.cache
2270 def get_filesystem_encoding():
2271 encoding = sys.getfilesystemencoding()
2272 return encoding if encoding is not None else 'utf-8'
2273
2274
2275 def shell_quote(args):
2276 quoted_args = []
2277 encoding = get_filesystem_encoding()
2278 for a in args:
2279 if isinstance(a, bytes):
2280 # We may get a filename encoded with 'encodeFilename'
2281 a = a.decode(encoding)
2282 quoted_args.append(compat_shlex_quote(a))
2283 return ' '.join(quoted_args)
2284
2285
2286 def smuggle_url(url, data):
2287 """ Pass additional data in a URL for internal use. """
2288
2289 url, idata = unsmuggle_url(url, {})
2290 data.update(idata)
2291 sdata = urllib.parse.urlencode(
2292 {'__youtubedl_smuggle': json.dumps(data)})
2293 return url + '#' + sdata
2294
2295
2296 def unsmuggle_url(smug_url, default=None):
2297 if '#__youtubedl_smuggle' not in smug_url:
2298 return smug_url, default
2299 url, _, sdata = smug_url.rpartition('#')
2300 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2301 data = json.loads(jsond)
2302 return url, data
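# Illustrative sketch: smuggle_url/unsmuggle_url round-trip via the fragment:
#   url = smuggle_url('http://example.com/', {'a': 'b'})
#   # -> 'http://example.com/#__youtubedl_smuggle=...'
#   unsmuggle_url(url)  # -> ('http://example.com/', {'a': 'b'})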
2303
2304
2305 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2306 """ Formats numbers with decimal sufixes like K, M, etc """
2307 num, factor = float_or_none(num), float(factor)
2308 if num is None or num < 0:
2309 return None
2310 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2311 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2312 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2313 if factor == 1024:
2314 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2315 converted = num / (factor ** exponent)
2316 return fmt % (converted, suffix)
2317
2318
2319 def format_bytes(bytes):
2320 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
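# Illustrative sketch: decimal factors produce plain suffixes, while
# factor=1024 (as used by format_bytes) switches to binary 'Ki'/'Mi' ones:
#   format_decimal_suffix(123456, '%.1f%s')  # -> '123.5k'
#   format_bytes(1048576)  # -> '1.00MiB'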
2321
2322
2323 def lookup_unit_table(unit_table, s, strict=False):
2324 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2325 units_re = '|'.join(re.escape(u) for u in unit_table)
2326 m = (re.fullmatch if strict else re.match)(
2327 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2328 if not m:
2329 return None
2330
2331 num = float(m.group('num').replace(',', '.'))
2332 mult = unit_table[m.group('unit')]
2333 return round(num * mult)
2334
2335
2336 def parse_bytes(s):
2337 """Parse a string indicating a byte quantity into an integer"""
2338 return lookup_unit_table(
2339 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2340 s.upper(), strict=True)
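# Illustrative sketch (assuming the usual NUMBER_RE defined elsewhere):
# single-letter suffixes are interpreted as binary, i.e. powers of 1024:
#   parse_bytes('500K')  # -> 512000
#   parse_bytes('1.5M')  # -> 1572864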
2341
2342
2343 def parse_filesize(s):
2344 if s is None:
2345 return None
2346
2347 # The lower-case forms are of course incorrect and unofficial,
2348 # but we support those too
2349 _UNIT_TABLE = {
2350 'B': 1,
2351 'b': 1,
2352 'bytes': 1,
2353 'KiB': 1024,
2354 'KB': 1000,
2355 'kB': 1024,
2356 'Kb': 1000,
2357 'kb': 1000,
2358 'kilobytes': 1000,
2359 'kibibytes': 1024,
2360 'MiB': 1024 ** 2,
2361 'MB': 1000 ** 2,
2362 'mB': 1024 ** 2,
2363 'Mb': 1000 ** 2,
2364 'mb': 1000 ** 2,
2365 'megabytes': 1000 ** 2,
2366 'mebibytes': 1024 ** 2,
2367 'GiB': 1024 ** 3,
2368 'GB': 1000 ** 3,
2369 'gB': 1024 ** 3,
2370 'Gb': 1000 ** 3,
2371 'gb': 1000 ** 3,
2372 'gigabytes': 1000 ** 3,
2373 'gibibytes': 1024 ** 3,
2374 'TiB': 1024 ** 4,
2375 'TB': 1000 ** 4,
2376 'tB': 1024 ** 4,
2377 'Tb': 1000 ** 4,
2378 'tb': 1000 ** 4,
2379 'terabytes': 1000 ** 4,
2380 'tebibytes': 1024 ** 4,
2381 'PiB': 1024 ** 5,
2382 'PB': 1000 ** 5,
2383 'pB': 1024 ** 5,
2384 'Pb': 1000 ** 5,
2385 'pb': 1000 ** 5,
2386 'petabytes': 1000 ** 5,
2387 'pebibytes': 1024 ** 5,
2388 'EiB': 1024 ** 6,
2389 'EB': 1000 ** 6,
2390 'eB': 1024 ** 6,
2391 'Eb': 1000 ** 6,
2392 'eb': 1000 ** 6,
2393 'exabytes': 1000 ** 6,
2394 'exbibytes': 1024 ** 6,
2395 'ZiB': 1024 ** 7,
2396 'ZB': 1000 ** 7,
2397 'zB': 1024 ** 7,
2398 'Zb': 1000 ** 7,
2399 'zb': 1000 ** 7,
2400 'zettabytes': 1000 ** 7,
2401 'zebibytes': 1024 ** 7,
2402 'YiB': 1024 ** 8,
2403 'YB': 1000 ** 8,
2404 'yB': 1024 ** 8,
2405 'Yb': 1000 ** 8,
2406 'yb': 1000 ** 8,
2407 'yottabytes': 1000 ** 8,
2408 'yobibytes': 1024 ** 8,
2409 }
2410
2411 return lookup_unit_table(_UNIT_TABLE, s)
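# Illustrative sketch: case decides decimal vs binary multipliers, and a
# comma is accepted as the decimal separator:
#   parse_filesize('5 GiB')   # -> 5368709120
#   parse_filesize('10.5MB')  # -> 10500000
#   parse_filesize('10,5MB')  # -> 10500000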
2412
2413
2414 def parse_count(s):
2415 if s is None:
2416 return None
2417
2418 s = re.sub(r'^[^\d]+\s', '', s).strip()
2419
2420 if re.match(r'^[\d,.]+$', s):
2421 return str_to_int(s)
2422
2423 _UNIT_TABLE = {
2424 'k': 1000,
2425 'K': 1000,
2426 'm': 1000 ** 2,
2427 'M': 1000 ** 2,
2428 'kk': 1000 ** 2,
2429 'KK': 1000 ** 2,
2430 'b': 1000 ** 3,
2431 'B': 1000 ** 3,
2432 }
2433
2434 ret = lookup_unit_table(_UNIT_TABLE, s)
2435 if ret is not None:
2436 return ret
2437
2438 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2439 if mobj:
2440 return str_to_int(mobj.group(1))
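# Illustrative sketch: view-count style strings in the forms handled above:
#   parse_count('1.2M')  # -> 1200000
#   parse_count('1,000 views')  # -> 1000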
2441
2442
2443 def parse_resolution(s, *, lenient=False):
2444 if s is None:
2445 return {}
2446
2447 if lenient:
2448 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2449 else:
2450 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2451 if mobj:
2452 return {
2453 'width': int(mobj.group('w')),
2454 'height': int(mobj.group('h')),
2455 }
2456
2457 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2458 if mobj:
2459 return {'height': int(mobj.group(1))}
2460
2461 mobj = re.search(r'\b([48])[kK]\b', s)
2462 if mobj:
2463 return {'height': int(mobj.group(1)) * 540}
2464
2465 return {}
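# Illustrative sketch of the three notations recognized above:
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}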
2466
2467
2468 def parse_bitrate(s):
2469 if not isinstance(s, str):
2470 return
2471 mobj = re.search(r'\b(\d+)\s*kbps', s)
2472 if mobj:
2473 return int(mobj.group(1))
2474
2475
2476 def month_by_name(name, lang='en'):
2477 """ Return the number of a month by (locale-independently) English name """
2478
2479 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2480
2481 try:
2482 return month_names.index(name) + 1
2483 except ValueError:
2484 return None
2485
2486
2487 def month_by_abbreviation(abbrev):
2488 """ Return the number of a month by (locale-independently) English
2489 abbreviations """
2490
2491 try:
2492 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2493 except ValueError:
2494 return None
2495
2496
2497 def fix_xml_ampersands(xml_str):
2498 """Replace all the '&' by '&amp;' in XML"""
2499 return re.sub(
2500 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2501 '&amp;',
2502 xml_str)
2503
2504
2505 def setproctitle(title):
2506 assert isinstance(title, str)
2507
2508 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2509 try:
2510 import ctypes
2511 except ImportError:
2512 return
2513
2514 try:
2515 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2516 except OSError:
2517 return
2518 except TypeError:
2519 # LoadLibrary in Windows Python 2.7.13 only expects
2520 # a bytestring, but since unicode_literals turns
2521 # every string into a unicode string, it fails.
2522 return
2523 title_bytes = title.encode()
2524 buf = ctypes.create_string_buffer(len(title_bytes))
2525 buf.value = title_bytes
2526 try:
2527 libc.prctl(15, buf, 0, 0, 0)
2528 except AttributeError:
2529 return # Strange libc, just skip this
2530
2531
2532 def remove_start(s, start):
2533 return s[len(start):] if s is not None and s.startswith(start) else s
2534
2535
2536 def remove_end(s, end):
2537 return s[:-len(end)] if s is not None and s.endswith(end) else s
2538
2539
2540 def remove_quotes(s):
2541 if s is None or len(s) < 2:
2542 return s
2543 for quote in ('"', "'", ):
2544 if s[0] == quote and s[-1] == quote:
2545 return s[1:-1]
2546 return s
2547
2548
2549 def get_domain(url):
2550 """
2551 This implementation is inconsistent, but is kept for compatibility.
2552 Use this only for "webpage_url_domain"
2553 """
2554 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2555
2556
2557 def url_basename(url):
2558 path = urllib.parse.urlparse(url).path
2559 return path.strip('/').split('/')[-1]
2560
2561
2562 def base_url(url):
2563 return re.match(r'https?://[^?#]+/', url).group()
2564
2565
2566 def urljoin(base, path):
2567 if isinstance(path, bytes):
2568 path = path.decode()
2569 if not isinstance(path, str) or not path:
2570 return None
2571 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2572 return path
2573 if isinstance(base, bytes):
2574 base = base.decode()
2575 if not isinstance(base, str) or not re.match(
2576 r'^(?:https?:)?//', base):
2577 return None
2578 return urllib.parse.urljoin(base, path)
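# Illustrative sketch: scheme-relative paths pass through unchanged, and a
# non-http(s) base yields None:
#   urljoin('http://foo.example/', '/a/b/c.txt')  # -> 'http://foo.example/a/b/c.txt'
#   urljoin('http://foo.example/', '//cdn.example/abc')  # -> '//cdn.example/abc'
#   urljoin('rtmp://foo.example/', '/abc')  # -> None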
2579
2580
2581 class HEADRequest(urllib.request.Request):
2582 def get_method(self):
2583 return 'HEAD'
2584
2585
2586 class PUTRequest(urllib.request.Request):
2587 def get_method(self):
2588 return 'PUT'
2589
2590
2591 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2592 if get_attr and v is not None:
2593 v = getattr(v, get_attr, None)
2594 try:
2595 return int(v) * invscale // scale
2596 except (ValueError, TypeError, OverflowError):
2597 return default
2598
2599
2600 def str_or_none(v, default=None):
2601 return default if v is None else str(v)
2602
2603
2604 def str_to_int(int_str):
2605 """ A more relaxed version of int_or_none """
2606 if isinstance(int_str, int):
2607 return int_str
2608 elif isinstance(int_str, str):
2609 int_str = re.sub(r'[,\.\+]', '', int_str)
2610 return int_or_none(int_str)
2611
2612
2613 def float_or_none(v, scale=1, invscale=1, default=None):
2614 if v is None:
2615 return default
2616 try:
2617 return float(v) * invscale / scale
2618 except (ValueError, TypeError):
2619 return default
2620
2621
2622 def bool_or_none(v, default=None):
2623 return v if isinstance(v, bool) else default
2624
2625
2626 def strip_or_none(v, default=None):
2627 return v.strip() if isinstance(v, str) else default
2628
2629
2630 def url_or_none(url):
2631 if not url or not isinstance(url, str):
2632 return None
2633 url = url.strip()
2634 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2635
2636
2637 def request_to_url(req):
2638 if isinstance(req, urllib.request.Request):
2639 return req.get_full_url()
2640 else:
2641 return req
2642
2643
2644 def strftime_or_none(timestamp, date_format, default=None):
2645 datetime_object = None
2646 try:
2647 if isinstance(timestamp, (int, float)): # unix timestamp
2648 # Using naive datetime here can break timestamp() in Windows
2649 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2650 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2651 elif isinstance(timestamp, str): # assume YYYYMMDD
2652 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2653 date_format = re.sub( # Support %s on windows
2654 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2655 return datetime_object.strftime(date_format)
2656 except (ValueError, TypeError, AttributeError):
2657 return default
2658
2659
2660 def parse_duration(s):
2661 if not isinstance(s, str):
2662 return None
2663 s = s.strip()
2664 if not s:
2665 return None
2666
2667 days, hours, mins, secs, ms = [None] * 5
2668 m = re.match(r'''(?x)
2669 (?P<before_secs>
2670 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2671 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2672 (?P<ms>[.:][0-9]+)?Z?$
2673 ''', s)
2674 if m:
2675 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2676 else:
2677 m = re.match(
2678 r'''(?ix)(?:P?
2679 (?:
2680 [0-9]+\s*y(?:ears?)?,?\s*
2681 )?
2682 (?:
2683 [0-9]+\s*m(?:onths?)?,?\s*
2684 )?
2685 (?:
2686 [0-9]+\s*w(?:eeks?)?,?\s*
2687 )?
2688 (?:
2689 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2690 )?
2691 T)?
2692 (?:
2693 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2694 )?
2695 (?:
2696 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2697 )?
2698 (?:
2699 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2700 )?Z?$''', s)
2701 if m:
2702 days, hours, mins, secs, ms = m.groups()
2703 else:
2704 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2705 if m:
2706 hours, mins = m.groups()
2707 else:
2708 return None
2709
2710 if ms:
2711 ms = ms.replace(':', '.')
2712 return sum(float(part or 0) * mult for part, mult in (
2713 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
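# Illustrative sketch covering the three pattern families matched above:
#   parse_duration('9:54:36')     # -> 35676.0
#   parse_duration('PT1H30M')     # -> 5400.0
#   parse_duration('3 min 13 s')  # -> 193.0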
2714
2715
2716 def prepend_extension(filename, ext, expected_real_ext=None):
2717 name, real_ext = os.path.splitext(filename)
2718 return (
2719 f'{name}.{ext}{real_ext}'
2720 if not expected_real_ext or real_ext[1:] == expected_real_ext
2721 else f'{filename}.{ext}')
2722
2723
2724 def replace_extension(filename, ext, expected_real_ext=None):
2725 name, real_ext = os.path.splitext(filename)
2726 return '{}.{}'.format(
2727 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2728 ext)
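# Illustrative sketch: expected_real_ext guards against double extensions:
#   prepend_extension('abc.ext', 'temp')  # -> 'abc.temp.ext'
#   prepend_extension('abc.unexpected_ext', 'temp', 'ext')  # -> 'abc.unexpected_ext.temp'
#   replace_extension('abc.ext', 'temp')  # -> 'abc.temp'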
2729
2730
2731 def check_executable(exe, args=[]):
2732 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2733 args can be a list of arguments that produce a short output (like -version) """
2734 try:
2735 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2736 except OSError:
2737 return False
2738 return exe
2739
2740
2741 def _get_exe_version_output(exe, args):
2742 try:
2743 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2744 # SIGTTOU if yt-dlp is run in the background.
2745 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2746 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2747 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2748 if ret:
2749 return None
2750 except OSError:
2751 return False
2752 return stdout
2753
2754
2755 def detect_exe_version(output, version_re=None, unrecognized='present'):
2756 assert isinstance(output, str)
2757 if version_re is None:
2758 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2759 m = re.search(version_re, output)
2760 if m:
2761 return m.group(1)
2762 else:
2763 return unrecognized
2764
2765
2766 def get_exe_version(exe, args=['--version'],
2767 version_re=None, unrecognized=('present', 'broken')):
2768 """ Returns the version of the specified executable,
2769 or False if the executable is not present """
2770 unrecognized = variadic(unrecognized)
2771 assert len(unrecognized) in (1, 2)
2772 out = _get_exe_version_output(exe, args)
2773 if out is None:
2774 return unrecognized[-1]
2775 return out and detect_exe_version(out, version_re, unrecognized[0])
2776
2777
2778 def frange(start=0, stop=None, step=1):
2779 """Float range"""
2780 if stop is None:
2781 start, stop = 0, start
2782 sign = [-1, 1][step > 0] if step else 0
2783 while sign * start < sign * stop:
2784 yield start
2785 start += step
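# Illustrative sketch: like range(), but supporting float steps:
#   list(frange(3))           # -> [0, 1, 2]
#   list(frange(0, 1, 0.25))  # -> [0, 0.25, 0.5, 0.75]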
2786
2787
2788 class LazyList(collections.abc.Sequence):
2789 """Lazy immutable list from an iterable
2790 Note that slices of a LazyList are lists and not LazyLists"""
2791
2792 class IndexError(IndexError):
2793 pass
2794
2795 def __init__(self, iterable, *, reverse=False, _cache=None):
2796 self._iterable = iter(iterable)
2797 self._cache = [] if _cache is None else _cache
2798 self._reversed = reverse
2799
2800 def __iter__(self):
2801 if self._reversed:
2802 # We need to consume the entire iterable to iterate in reverse
2803 yield from self.exhaust()
2804 return
2805 yield from self._cache
2806 for item in self._iterable:
2807 self._cache.append(item)
2808 yield item
2809
2810 def _exhaust(self):
2811 self._cache.extend(self._iterable)
2812 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2813 return self._cache
2814
2815 def exhaust(self):
2816 """Evaluate the entire iterable"""
2817 return self._exhaust()[::-1 if self._reversed else 1]
2818
2819 @staticmethod
2820 def _reverse_index(x):
2821 return None if x is None else ~x
2822
2823 def __getitem__(self, idx):
2824 if isinstance(idx, slice):
2825 if self._reversed:
2826 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2827 start, stop, step = idx.start, idx.stop, idx.step or 1
2828 elif isinstance(idx, int):
2829 if self._reversed:
2830 idx = self._reverse_index(idx)
2831 start, stop, step = idx, idx, 0
2832 else:
2833 raise TypeError('indices must be integers or slices')
2834 if ((start or 0) < 0 or (stop or 0) < 0
2835 or (start is None and step < 0)
2836 or (stop is None and step > 0)):
2837 # We need to consume the entire iterable to be able to slice from the end
2838 # Obviously, never use this with infinite iterables
2839 self._exhaust()
2840 try:
2841 return self._cache[idx]
2842 except IndexError as e:
2843 raise self.IndexError(e) from e
2844 n = max(start or 0, stop or 0) - len(self._cache) + 1
2845 if n > 0:
2846 self._cache.extend(itertools.islice(self._iterable, n))
2847 try:
2848 return self._cache[idx]
2849 except IndexError as e:
2850 raise self.IndexError(e) from e
2851
2852 def __bool__(self):
2853 try:
2854 self[-1] if self._reversed else self[0]
2855 except self.IndexError:
2856 return False
2857 return True
2858
2859 def __len__(self):
2860 self._exhaust()
2861 return len(self._cache)
2862
2863 def __reversed__(self):
2864 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2865
2866 def __copy__(self):
2867 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2868
2869 def __repr__(self):
2870 # repr and str should mimic a list. So we exhaust the iterable
2871 return repr(self.exhaust())
2872
2873 def __str__(self):
2874 return repr(self.exhaust())
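# Illustrative sketch: a LazyList consumes its iterable only as far as needed,
# and slicing returns a plain list:
#   lazy = LazyList(itertools.count())
#   lazy[:5]  # -> [0, 1, 2, 3, 4]; only the first 6 items are consumed
#   LazyList(range(5), reverse=True)[0]  # -> 4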
2875
2876
2877 class PagedList:
2878
2879 class IndexError(IndexError):
2880 pass
2881
2882 def __len__(self):
2883 # This is only useful for tests
2884 return len(self.getslice())
2885
2886 def __init__(self, pagefunc, pagesize, use_cache=True):
2887 self._pagefunc = pagefunc
2888 self._pagesize = pagesize
2889 self._pagecount = float('inf')
2890 self._use_cache = use_cache
2891 self._cache = {}
2892
2893 def getpage(self, pagenum):
2894 page_results = self._cache.get(pagenum)
2895 if page_results is None:
2896 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2897 if self._use_cache:
2898 self._cache[pagenum] = page_results
2899 return page_results
2900
2901 def getslice(self, start=0, end=None):
2902 return list(self._getslice(start, end))
2903
2904 def _getslice(self, start, end):
2905 raise NotImplementedError('This method must be implemented by subclasses')
2906
2907 def __getitem__(self, idx):
2908 assert self._use_cache, 'Indexing PagedList requires cache'
2909 if not isinstance(idx, int) or idx < 0:
2910 raise TypeError('indices must be non-negative integers')
2911 entries = self.getslice(idx, idx + 1)
2912 if not entries:
2913 raise self.IndexError()
2914 return entries[0]
2915
2916
2917 class OnDemandPagedList(PagedList):
2918 """Download pages until a page with less than maximum results"""
2919
2920 def _getslice(self, start, end):
2921 for pagenum in itertools.count(start // self._pagesize):
2922 firstid = pagenum * self._pagesize
2923 nextfirstid = pagenum * self._pagesize + self._pagesize
2924 if start >= nextfirstid:
2925 continue
2926
2927 startv = (
2928 start % self._pagesize
2929 if firstid <= start < nextfirstid
2930 else 0)
2931 endv = (
2932 ((end - 1) % self._pagesize) + 1
2933 if (end is not None and firstid <= end <= nextfirstid)
2934 else None)
2935
2936 try:
2937 page_results = self.getpage(pagenum)
2938 except Exception:
2939 self._pagecount = pagenum - 1
2940 raise
2941 if startv != 0 or endv is not None:
2942 page_results = page_results[startv:endv]
2943 yield from page_results
2944
2945 # A little optimization: if the current page is not "full", i.e. does
2946 # not contain page_size videos, then we can assume that this page
2947 # is the last one; there are no more ids on further pages,
2948 # so there is no need to query again.
2949 if len(page_results) + startv < self._pagesize:
2950 break
2951
2952 # If we got the whole page, but the next page is not interesting,
2953 # break out early as well
2954 if end == nextfirstid:
2955 break
2956
2957
2958 class InAdvancePagedList(PagedList):
2959 """PagedList with total number of pages known in advance"""
2960
2961 def __init__(self, pagefunc, pagecount, pagesize):
2962 PagedList.__init__(self, pagefunc, pagesize, True)
2963 self._pagecount = pagecount
2964
2965 def _getslice(self, start, end):
2966 start_page = start // self._pagesize
2967 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2968 skip_elems = start - start_page * self._pagesize
2969 only_more = None if end is None else end - start
2970 for pagenum in range(start_page, end_page):
2971 page_results = self.getpage(pagenum)
2972 if skip_elems:
2973 page_results = page_results[skip_elems:]
2974 skip_elems = None
2975 if only_more is not None:
2976 if len(page_results) < only_more:
2977 only_more -= len(page_results)
2978 else:
2979 yield from page_results[:only_more]
2980 break
2981 yield from page_results
2982
2983
2984 class PlaylistEntries:
2985 MissingEntry = object()
2986 is_exhausted = False
2987
2988 def __init__(self, ydl, info_dict):
2989 self.ydl = ydl
2990
2991 # _entries must be assigned now since infodict can change during iteration
2992 entries = info_dict.get('entries')
2993 if entries is None:
2994 raise EntryNotInPlaylist('There are no entries')
2995 elif isinstance(entries, list):
2996 self.is_exhausted = True
2997
2998 requested_entries = info_dict.get('requested_entries')
2999 self.is_incomplete = requested_entries is not None
3000 if self.is_incomplete:
3001 assert self.is_exhausted
3002 self._entries = [self.MissingEntry] * max(requested_entries or [0])
3003 for i, entry in zip(requested_entries, entries):
3004 self._entries[i - 1] = entry
3005 elif isinstance(entries, (list, PagedList, LazyList)):
3006 self._entries = entries
3007 else:
3008 self._entries = LazyList(entries)
3009
3010 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
3011 (?P<start>[+-]?\d+)?
3012 (?P<range>[:-]
3013 (?P<end>[+-]?\d+|inf(?:inite)?)?
3014 (?::(?P<step>[+-]?\d+))?
3015 )?''')
3016
3017 @classmethod
3018 def parse_playlist_items(cls, string):
3019 for segment in string.split(','):
3020 if not segment:
3021 raise ValueError('There are two or more consecutive commas')
3022 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
3023 if not mobj:
3024 raise ValueError(f'{segment!r} is not a valid specification')
3025 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
3026 if int_or_none(step) == 0:
3027 raise ValueError(f'Step in {segment!r} cannot be zero')
3028 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3029
3030 def get_requested_items(self):
3031 playlist_items = self.ydl.params.get('playlist_items')
3032 playlist_start = self.ydl.params.get('playliststart', 1)
3033 playlist_end = self.ydl.params.get('playlistend')
3034 # For backwards compatibility, interpret -1 as whole list
3035 if playlist_end in (-1, None):
3036 playlist_end = ''
3037 if not playlist_items:
3038 playlist_items = f'{playlist_start}:{playlist_end}'
3039 elif playlist_start != 1 or playlist_end:
3040 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3041
3042 for index in self.parse_playlist_items(playlist_items):
3043 for i, entry in self[index]:
3044 yield i, entry
3045 if not entry:
3046 continue
3047 try:
3048 # The item may have just been added to archive. Don't break due to it
3049 if not self.ydl.params.get('lazy_playlist'):
3050 # TODO: Add auto-generated fields
3051 self.ydl._match_entry(entry, incomplete=True, silent=True)
3052 except (ExistingVideoReached, RejectedVideoReached):
3053 return
3054
3055 def get_full_count(self):
3056 if self.is_exhausted and not self.is_incomplete:
3057 return len(self)
3058 elif isinstance(self._entries, InAdvancePagedList):
3059 if self._entries._pagesize == 1:
3060 return self._entries._pagecount
3061
3062 @functools.cached_property
3063 def _getter(self):
3064 if isinstance(self._entries, list):
3065 def get_entry(i):
3066 try:
3067 entry = self._entries[i]
3068 except IndexError:
3069 entry = self.MissingEntry
3070 if not self.is_incomplete:
3071 raise self.IndexError()
3072 if entry is self.MissingEntry:
3073 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3074 return entry
3075 else:
3076 def get_entry(i):
3077 try:
3078 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3079 except (LazyList.IndexError, PagedList.IndexError):
3080 raise self.IndexError()
3081 return get_entry
3082
3083 def __getitem__(self, idx):
3084 if isinstance(idx, int):
3085 idx = slice(idx, idx)
3086
3087 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3088 step = 1 if idx.step is None else idx.step
3089 if idx.start is None:
3090 start = 0 if step > 0 else len(self) - 1
3091 else:
3092 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3093
3094 # NB: Do not call len(self) when idx == [:]
3095 if idx.stop is None:
3096 stop = 0 if step < 0 else float('inf')
3097 else:
3098 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3099 stop += [-1, 1][step > 0]
3100
3101 for i in frange(start, stop, step):
3102 if i < 0:
3103 continue
3104 try:
3105 entry = self._getter(i)
3106 except self.IndexError:
3107 self.is_exhausted = True
3108 if step > 0:
3109 break
3110 continue
3111 yield i + 1, entry
3112
3113 def __len__(self):
3114 return len(tuple(self[:]))
3115
3116 class IndexError(IndexError):
3117 pass
3118
3119
3120 def uppercase_escape(s):
3121 unicode_escape = codecs.getdecoder('unicode_escape')
3122 return re.sub(
3123 r'\\U[0-9a-fA-F]{8}',
3124 lambda m: unicode_escape(m.group(0))[0],
3125 s)
3126
3127
3128 def lowercase_escape(s):
3129 unicode_escape = codecs.getdecoder('unicode_escape')
3130 return re.sub(
3131 r'\\u[0-9a-fA-F]{4}',
3132 lambda m: unicode_escape(m.group(0))[0],
3133 s)
3134
3135
3136 def escape_rfc3986(s):
3137 """Escape non-ASCII characters as suggested by RFC 3986"""
3138 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3139
3140
3141 def escape_url(url):
3142 """Escape URL as suggested by RFC 3986"""
3143 url_parsed = urllib.parse.urlparse(url)
3144 return url_parsed._replace(
3145 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3146 path=escape_rfc3986(url_parsed.path),
3147 params=escape_rfc3986(url_parsed.params),
3148 query=escape_rfc3986(url_parsed.query),
3149 fragment=escape_rfc3986(url_parsed.fragment)
3150 ).geturl()
3151
3152
3153 def parse_qs(url, **kwargs):
3154 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3155
3156
3157 def read_batch_urls(batch_fd):
3158 def fixup(url):
3159 if not isinstance(url, str):
3160 url = url.decode('utf-8', 'replace')
3161 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3162 for bom in BOM_UTF8:
3163 if url.startswith(bom):
3164 url = url[len(bom):]
3165 url = url.lstrip()
3166 if not url or url.startswith(('#', ';', ']')):
3167 return False
3168 # "#" cannot be stripped out since it is part of the URI
3169 # However, it can be safely stripped out if it follows whitespace
3170 return re.split(r'\s#', url, 1)[0].rstrip()
3171
3172 with contextlib.closing(batch_fd) as fd:
3173 return [url for url in map(fixup, fd) if url]
3174
3175
3176 def urlencode_postdata(*args, **kargs):
3177 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3178
3179
3180 def update_url(url, *, query_update=None, **kwargs):
3181 """Replace URL components specified by kwargs
3182 @param url str or parse url tuple
3183 @param query_update update query
3184 @returns str
3185 """
3186 if isinstance(url, str):
3187 if not kwargs and not query_update:
3188 return url
3189 else:
3190 url = urllib.parse.urlparse(url)
3191 if query_update:
3192 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3193 kwargs['query'] = urllib.parse.urlencode({
3194 **urllib.parse.parse_qs(url.query),
3195 **query_update
3196 }, True)
3197 return urllib.parse.urlunparse(url._replace(**kwargs))
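# Illustrative sketch: query_update merges into the existing query string:
#   update_url('http://example.com/path?a=1', query_update={'b': '2'})
#   # -> 'http://example.com/path?a=1&b=2'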
3198
3199
3200 def update_url_query(url, query):
3201 return update_url(url, query_update=query)
3202
3203
3204 def update_Request(req, url=None, data=None, headers=None, query=None):
3205 req_headers = req.headers.copy()
3206 req_headers.update(headers or {})
3207 req_data = data or req.data
3208 req_url = update_url_query(url or req.get_full_url(), query)
3209 req_get_method = req.get_method()
3210 if req_get_method == 'HEAD':
3211 req_type = HEADRequest
3212 elif req_get_method == 'PUT':
3213 req_type = PUTRequest
3214 else:
3215 req_type = urllib.request.Request
3216 new_req = req_type(
3217 req_url, data=req_data, headers=req_headers,
3218 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3219 if hasattr(req, 'timeout'):
3220 new_req.timeout = req.timeout
3221 return new_req
3222
3223
3224 def _multipart_encode_impl(data, boundary):
3225 content_type = 'multipart/form-data; boundary=%s' % boundary
3226
3227 out = b''
3228 for k, v in data.items():
3229 out += b'--' + boundary.encode('ascii') + b'\r\n'
3230 if isinstance(k, str):
3231 k = k.encode()
3232 if isinstance(v, str):
3233 v = v.encode()
3234 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3235 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3236 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3237 if boundary.encode('ascii') in content:
3238 raise ValueError('Boundary overlaps with data')
3239 out += content
3240
3241 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3242
3243 return out, content_type
3244
3245
3246 def multipart_encode(data, boundary=None):
3247 '''
3248 Encode a dict to RFC 7578-compliant form-data
3249
3250 data:
3251 A dict where keys and values can be either Unicode or bytes-like
3252 objects.
3253 boundary:
3254 If specified, a Unicode object to use as the boundary. Otherwise
3255 a random boundary is generated.
3256
3257 Reference: https://tools.ietf.org/html/rfc7578
3258 '''
3259 has_specified_boundary = boundary is not None
3260
3261 while True:
3262 if boundary is None:
3263 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3264
3265 try:
3266 out, content_type = _multipart_encode_impl(data, boundary)
3267 break
3268 except ValueError:
3269 if has_specified_boundary:
3270 raise
3271 boundary = None
3272
3273 return out, content_type
3274
3275
3276 def variadic(x, allowed_types=(str, bytes, dict)):
3277 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
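# Illustrative sketch: non-iterables and the "allowed" scalar types are
# wrapped in a tuple, while other iterables pass through unchanged:
#   variadic('spam')    # -> ('spam',)
#   variadic(['spam'])  # -> ['spam']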
3278
3279
3280 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3281 for val in map(d.get, variadic(key_or_keys)):
3282 if val is not None and (val or not skip_false_values):
3283 return val
3284 return default
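# Illustrative sketch: falsy (but non-None) values are skipped by default:
#   dict_get({'a': '', 'b': 'x'}, ('a', 'b'))  # -> 'x'
#   dict_get({'a': ''}, ('a', 'b'), skip_false_values=False)  # -> ''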
3285
3286
3287 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3288 for f in funcs:
3289 try:
3290 val = f(*args, **kwargs)
3291 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3292 pass
3293 else:
3294 if expected_type is None or isinstance(val, expected_type):
3295 return val
3296
3297
3298 def try_get(src, getter, expected_type=None):
3299 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3300
3301
3302 def filter_dict(dct, cndn=lambda _, v: v is not None):
3303 return {k: v for k, v in dct.items() if cndn(k, v)}
3304
3305
3306 def merge_dicts(*dicts):
3307 merged = {}
3308 for a_dict in dicts:
3309 for k, v in a_dict.items():
3310 if (v is not None and k not in merged
3311 or isinstance(v, str) and merged[k] == ''):
3312 merged[k] = v
3313 return merged
3314
3315
3316 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3317 return string if isinstance(string, str) else str(string, encoding, errors)
3318
3319
3320 US_RATINGS = {
3321 'G': 0,
3322 'PG': 10,
3323 'PG-13': 13,
3324 'R': 16,
3325 'NC': 18,
3326 }
3327
3328
3329 TV_PARENTAL_GUIDELINES = {
3330 'TV-Y': 0,
3331 'TV-Y7': 7,
3332 'TV-G': 0,
3333 'TV-PG': 0,
3334 'TV-14': 14,
3335 'TV-MA': 17,
3336 }
3337
3338
3339 def parse_age_limit(s):
3340 # isinstance(False, int) is True. So type() must be used instead
3341 if type(s) is int: # noqa: E721
3342 return s if 0 <= s <= 21 else None
3343 elif not isinstance(s, str):
3344 return None
3345 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3346 if m:
3347 return int(m.group('age'))
3348 s = s.upper()
3349 if s in US_RATINGS:
3350 return US_RATINGS[s]
3351 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3352 if m:
3353 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3354 return None
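# Illustrative sketch of the rating systems recognized above:
#   parse_age_limit('18+')    # -> 18
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17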
3355
3356
3357 def strip_jsonp(code):
3358 return re.sub(
3359 r'''(?sx)^
3360 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3361 (?:\s*&&\s*(?P=func_name))?
3362 \s*\(\s*(?P<callback_data>.*)\);?
3363 \s*?(?://[^\n]*)*$''',
3364 r'\g<callback_data>', code)
3365
3366
3367 def js_to_json(code, vars={}, *, strict=False):
3368 # vars is a dict of var, val pairs to substitute
3369 STRING_QUOTES = '\'"`'
3370 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3371 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3372 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3373 INTEGER_TABLE = (
3374 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3375 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3376 )
3377
3378 def process_escape(match):
3379 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3380 escape = match.group(1) or match.group(2)
3381
3382 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3383 else R'\u00' if escape == 'x'
3384 else '' if escape == '\n'
3385 else escape)
3386
3387 def template_substitute(match):
3388 evaluated = js_to_json(match.group(1), vars, strict=strict)
3389 if evaluated[0] == '"':
3390 return json.loads(evaluated)
3391 return evaluated
3392
3393 def fix_kv(m):
3394 v = m.group(0)
3395 if v in ('true', 'false', 'null'):
3396 return v
3397 elif v in ('undefined', 'void 0'):
3398 return 'null'
3399 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3400 return ''
3401
3402 if v[0] in STRING_QUOTES:
3403 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3404 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
3405 return f'"{escaped}"'
3406
3407 for regex, base in INTEGER_TABLE:
3408 im = re.match(regex, v)
3409 if im:
3410 i = int(im.group(1), base)
3411 return f'"{i}":' if v.endswith(':') else str(i)
3412
3413 if v in vars:
3414 try:
3415 if not strict:
3416 json.loads(vars[v])
3417 except json.JSONDecodeError:
3418 return json.dumps(vars[v])
3419 else:
3420 return vars[v]
3421
3422 if not strict:
3423 return f'"{v}"'
3424
3425 raise ValueError(f'Unknown value: {v}')
3426
3427 def create_map(mobj):
3428 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3429
3430 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3431 if not strict:
3432 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3433 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3434 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3435 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
3436
3437 return re.sub(rf'''(?sx)
3438 {STRING_RE}|
3439 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3440 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3441 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3442 [0-9]+(?={SKIP_RE}:)|
3443 !+
3444 ''', fix_kv, code)
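# Illustrative sketch: unquoted keys, single-quoted strings and trailing
# commas are converted into valid JSON:
#   js_to_json("{ x: 1, y: 'str', }")  # -> '{ "x": 1, "y": "str" }'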
3445
3446
3447 def qualities(quality_ids):
3448 """ Get a numeric quality value out of a list of possible values """
3449 def q(qid):
3450 try:
3451 return quality_ids.index(qid)
3452 except ValueError:
3453 return -1
3454 return q
3455
3456
3457 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3458
3459
3460 DEFAULT_OUTTMPL = {
3461 'default': '%(title)s [%(id)s].%(ext)s',
3462 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3463 }
3464 OUTTMPL_TYPES = {
3465 'chapter': None,
3466 'subtitle': None,
3467 'thumbnail': None,
3468 'description': 'description',
3469 'annotation': 'annotations.xml',
3470 'infojson': 'info.json',
3471 'link': None,
3472 'pl_video': None,
3473 'pl_thumbnail': None,
3474 'pl_description': 'description',
3475 'pl_infojson': 'info.json',
3476 }
3477
3478 # As of [1], the format syntax is:
3479 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3480 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3481 STR_FORMAT_RE_TMPL = r'''(?x)
3482 (?<!%)(?P<prefix>(?:%%)*)
3483 %
3484 (?P<has_key>\((?P<key>{0})\))?
3485 (?P<format>
3486 (?P<conversion>[#0\-+ ]+)?
3487 (?P<min_width>\d+)?
3488 (?P<precision>\.\d+)?
3489 (?P<len_mod>[hlL])? # unused in python
3490 {1} # conversion type
3491 )
3492 '''
3493
3494
3495 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3496
3497
3498 def limit_length(s, length):
3499 """ Add ellipses to overly long strings """
3500 if s is None:
3501 return None
3502 ELLIPSES = '...'
3503 if len(s) > length:
3504 return s[:length - len(ELLIPSES)] + ELLIPSES
3505 return s
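
# e.g. (illustrative): limit_length('yt-dlp utilities', 10) == 'yt-dlp ...'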
3506
3507
3508 def version_tuple(v):
3509 return tuple(int(e) for e in re.split(r'[-.]', v))
3510
3511
3512 def is_outdated_version(version, limit, assume_new=True):
3513 if not version:
3514 return not assume_new
3515 try:
3516 return version_tuple(version) < version_tuple(limit)
3517 except ValueError:
3518 return not assume_new
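
# Versions compare as tuples of ints split on '.' and '-', so e.g.
# (illustrative): version_tuple('2023.01.06') == (2023, 1, 6) and
# is_outdated_version('2022.11.11', '2023.01.06') is True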
3519
3520
3521 def ytdl_is_updateable():
3522 """ Returns if yt-dlp can be updated with -U """
3523
3524 from .update import is_non_updateable
3525
3526 return not is_non_updateable()
3527
3528
3529 def args_to_str(args):
3530 # Get a short string representation for a subprocess command
3531 return ' '.join(compat_shlex_quote(a) for a in args)
3532
3533
3534 def error_to_compat_str(err):
3535 return str(err)
3536
3537
3538 def error_to_str(err):
3539 return f'{type(err).__name__}: {err}'
3540
3541
3542 def mimetype2ext(mt, default=NO_DEFAULT):
3543 if not isinstance(mt, str):
3544 if default is not NO_DEFAULT:
3545 return default
3546 return None
3547
3548 MAP = {
3549 # video
3550 '3gpp': '3gp',
3551 'mp2t': 'ts',
3552 'mp4': 'mp4',
3553 'mpeg': 'mpeg',
3554 'mpegurl': 'm3u8',
3555 'quicktime': 'mov',
3556 'webm': 'webm',
3557 'vp9': 'vp9',
3558 'x-flv': 'flv',
3559 'x-m4v': 'm4v',
3560 'x-matroska': 'mkv',
3561 'x-mng': 'mng',
3562 'x-mp4-fragmented': 'mp4',
3563 'x-ms-asf': 'asf',
3564 'x-ms-wmv': 'wmv',
3565 'x-msvideo': 'avi',
3566
3567 # application (streaming playlists)
3568 'dash+xml': 'mpd',
3569 'f4m+xml': 'f4m',
3570 'hds+xml': 'f4m',
3571 'vnd.apple.mpegurl': 'm3u8',
3572 'vnd.ms-sstr+xml': 'ism',
3573 'x-mpegurl': 'm3u8',
3574
3575 # audio
3576 'audio/mp4': 'm4a',
3577 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3578 # Using .mp3 as it's the most popular one
3579 'audio/mpeg': 'mp3',
3580 'audio/webm': 'webm',
3581 'audio/x-matroska': 'mka',
3582 'audio/x-mpegurl': 'm3u',
3583 'midi': 'mid',
3584 'ogg': 'ogg',
3585 'wav': 'wav',
3586 'wave': 'wav',
3587 'x-aac': 'aac',
3588 'x-flac': 'flac',
3589 'x-m4a': 'm4a',
3590 'x-realaudio': 'ra',
3591 'x-wav': 'wav',
3592
3593 # image
3594 'avif': 'avif',
3595 'bmp': 'bmp',
3596 'gif': 'gif',
3597 'jpeg': 'jpg',
3598 'png': 'png',
3599 'svg+xml': 'svg',
3600 'tiff': 'tif',
3601 'vnd.wap.wbmp': 'wbmp',
3602 'webp': 'webp',
3603 'x-icon': 'ico',
3604 'x-jng': 'jng',
3605 'x-ms-bmp': 'bmp',
3606
3607 # caption
3608 'filmstrip+json': 'fs',
3609 'smptett+xml': 'tt',
3610 'ttaf+xml': 'dfxp',
3611 'ttml+xml': 'ttml',
3612 'x-ms-sami': 'sami',
3613
3614 # misc
3615 'gzip': 'gz',
3616 'json': 'json',
3617 'xml': 'xml',
3618 'zip': 'zip',
3619 }
3620
3621 mimetype = mt.partition(';')[0].strip().lower()
3622 _, _, subtype = mimetype.rpartition('/')
3623
3624 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3625 if ext:
3626 return ext
3627 elif default is not NO_DEFAULT:
3628 return default
3629 return subtype.replace('+', '.')
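
# A couple of hand-worked lookups (illustrative):
#   mimetype2ext('video/mp4; codecs="avc1.64001E"') == 'mp4'  # parameters are ignored
#   mimetype2ext('application/vnd.apple.mpegurl') == 'm3u8'   # matched on the subtype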
3630
3631
3632 def ext2mimetype(ext_or_url):
3633 if not ext_or_url:
3634 return None
3635 if '.' not in ext_or_url:
3636 ext_or_url = f'file.{ext_or_url}'
3637 return mimetypes.guess_type(ext_or_url)[0]
3638
3639
3640 def parse_codecs(codecs_str):
3641 # http://tools.ietf.org/html/rfc6381
3642 if not codecs_str:
3643 return {}
3644 split_codecs = list(filter(None, map(
3645 str.strip, codecs_str.strip().strip(',').split(','))))
3646 vcodec, acodec, scodec, hdr = None, None, None, None
3647 for full_codec in split_codecs:
3648 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3649 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3650 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3651 if vcodec:
3652 continue
3653 vcodec = full_codec
3654 if parts[0] in ('dvh1', 'dvhe'):
3655 hdr = 'DV'
3656 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3657 hdr = 'HDR10'
3658 elif parts[:2] == ['vp9', '2']:
3659 hdr = 'HDR10'
3660 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3661 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3662 acodec = acodec or full_codec
3663 elif parts[0] in ('stpp', 'wvtt'):
3664 scodec = scodec or full_codec
3665 else:
3666 write_string(f'WARNING: Unknown codec {full_codec}\n')
3667 if vcodec or acodec or scodec:
3668 return {
3669 'vcodec': vcodec or 'none',
3670 'acodec': acodec or 'none',
3671 'dynamic_range': hdr,
3672 **({'scodec': scodec} if scodec is not None else {}),
3673 }
3674 elif len(split_codecs) == 2:
3675 return {
3676 'vcodec': split_codecs[0],
3677 'acodec': split_codecs[1],
3678 }
3679 return {}
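
# e.g. (hand-checked, illustrative input):
#   parse_codecs('avc1.64001f, mp4a.40.2')
#       == {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}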
3680
3681
3682 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3683 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3684
3685 allow_mkv = not preferences or 'mkv' in preferences
3686
3687 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3688 return 'mkv' # TODO: any other format allows this?
3689
3690 # TODO: Not all codecs supported by parse_codecs are handled here
3691 COMPATIBLE_CODECS = {
3692 'mp4': {
3693 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3694 'h264', 'aacl', 'ec-3', # Set in ISM
3695 },
3696 'webm': {
3697 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3698 'vp9x', 'vp8x', # in the webm spec
3699 },
3700 }
3701
3702 sanitize_codec = functools.partial(
3703 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3704 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3705
3706 for ext in preferences or COMPATIBLE_CODECS.keys():
3707 codec_set = COMPATIBLE_CODECS.get(ext, set())
3708 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3709 return ext
3710
3711 COMPATIBLE_EXTS = (
3712 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3713 {'webm', 'weba'},
3714 )
3715 for ext in preferences or vexts:
3716 current_exts = {ext, *vexts, *aexts}
3717 if ext == 'mkv' or current_exts == {ext} or any(
3718 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3719 return ext
3720 return 'mkv' if allow_mkv else preferences[-1]
3721
3722
3723 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3724 getheader = url_handle.headers.get
3725
3726 cd = getheader('Content-Disposition')
3727 if cd:
3728 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3729 if m:
3730 e = determine_ext(m.group('filename'), default_ext=None)
3731 if e:
3732 return e
3733
3734 meta_ext = getheader('x-amz-meta-name')
3735 if meta_ext:
3736 e = meta_ext.rpartition('.')[2]
3737 if e:
3738 return e
3739
3740 return mimetype2ext(getheader('Content-Type'), default=default)
3741
3742
3743 def encode_data_uri(data, mime_type):
3744 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3745
3746
3747 def age_restricted(content_limit, age_limit):
3748 """ Returns True iff the content should be blocked """
3749
3750 if age_limit is None: # No limit set
3751 return False
3752 if content_limit is None:
3753 return False # Content available for everyone
3754 return age_limit < content_limit
3755
3756
3757 # List of known byte-order-marks (BOM)
3758 BOMS = [
3759 (b'\xef\xbb\xbf', 'utf-8'),
3760 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3761 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3762 (b'\xff\xfe', 'utf-16-le'),
3763 (b'\xfe\xff', 'utf-16-be'),
3764 ]
3765
3766
3767 def is_html(first_bytes):
3768 """ Detect whether a file contains HTML by examining its first bytes. """
3769
3770 encoding = 'utf-8'
3771 for bom, enc in BOMS:
3772 while first_bytes.startswith(bom):
3773 encoding, first_bytes = enc, first_bytes[len(bom):]
3774
3775 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3776
3777
3778 def determine_protocol(info_dict):
3779 protocol = info_dict.get('protocol')
3780 if protocol is not None:
3781 return protocol
3782
3783 url = sanitize_url(info_dict['url'])
3784 if url.startswith('rtmp'):
3785 return 'rtmp'
3786 elif url.startswith('mms'):
3787 return 'mms'
3788 elif url.startswith('rtsp'):
3789 return 'rtsp'
3790
3791 ext = determine_ext(url)
3792 if ext == 'm3u8':
3793 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3794 elif ext == 'f4m':
3795 return 'f4m'
3796
3797 return urllib.parse.urlparse(url).scheme
3798
3799
3800 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3801 """ Render a list of rows, each as a list of values.
3802 Text after a \t will be right aligned """
3803 def width(string):
3804 return len(remove_terminal_sequences(string).replace('\t', ''))
3805
3806 def get_max_lens(table):
3807 return [max(width(str(v)) for v in col) for col in zip(*table)]
3808
3809 def filter_using_list(row, filterArray):
3810 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3811
3812 max_lens = get_max_lens(data) if hide_empty else []
3813 header_row = filter_using_list(header_row, max_lens)
3814 data = [filter_using_list(row, max_lens) for row in data]
3815
3816 table = [header_row] + data
3817 max_lens = get_max_lens(table)
3818 extra_gap += 1
3819 if delim:
3820 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3821 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3822 for row in table:
3823 for pos, text in enumerate(map(str, row)):
3824 if '\t' in text:
3825 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3826 else:
3827 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3828 ret = '\n'.join(''.join(row).rstrip() for row in table)
3829 return ret
3830
3831
3832 def _match_one(filter_part, dct, incomplete):
3833 # TODO: Generalize code with YoutubeDL._build_format_filter
3834 STRING_OPERATORS = {
3835 '*=': operator.contains,
3836 '^=': lambda attr, value: attr.startswith(value),
3837 '$=': lambda attr, value: attr.endswith(value),
3838 '~=': lambda attr, value: re.search(value, attr),
3839 }
3840 COMPARISON_OPERATORS = {
3841 **STRING_OPERATORS,
3842 '<=': operator.le, # "<=" must be defined above "<"
3843 '<': operator.lt,
3844 '>=': operator.ge,
3845 '>': operator.gt,
3846 '=': operator.eq,
3847 }
3848
3849 if isinstance(incomplete, bool):
3850 is_incomplete = lambda _: incomplete
3851 else:
3852 is_incomplete = lambda k: k in incomplete
3853
3854 operator_rex = re.compile(r'''(?x)
3855 (?P<key>[a-z_]+)
3856 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3857 (?:
3858 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3859 (?P<strval>.+?)
3860 )
3861 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3862 m = operator_rex.fullmatch(filter_part.strip())
3863 if m:
3864 m = m.groupdict()
3865 unnegated_op = COMPARISON_OPERATORS[m['op']]
3866 if m['negation']:
3867 op = lambda attr, value: not unnegated_op(attr, value)
3868 else:
3869 op = unnegated_op
3870 comparison_value = m['quotedstrval'] or m['strval']
3871 if m['quote']:
3872 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3873 actual_value = dct.get(m['key'])
3874 numeric_comparison = None
3875 if isinstance(actual_value, (int, float)):
3876 # If the original field is a string and matching comparisonvalue is
3877 # a number we should respect the origin of the original field
3878 # and process comparison value as a string (see
3879 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3880 try:
3881 numeric_comparison = int(comparison_value)
3882 except ValueError:
3883 numeric_comparison = parse_filesize(comparison_value)
3884 if numeric_comparison is None:
3885 numeric_comparison = parse_filesize(f'{comparison_value}B')
3886 if numeric_comparison is None:
3887 numeric_comparison = parse_duration(comparison_value)
3888 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3889 raise ValueError('Operator %s only supports string values!' % m['op'])
3890 if actual_value is None:
3891 return is_incomplete(m['key']) or m['none_inclusive']
3892 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3893
3894 UNARY_OPERATORS = {
3895 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3896 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3897 }
3898 operator_rex = re.compile(r'''(?x)
3899 (?P<op>%s)\s*(?P<key>[a-z_]+)
3900 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3901 m = operator_rex.fullmatch(filter_part.strip())
3902 if m:
3903 op = UNARY_OPERATORS[m.group('op')]
3904 actual_value = dct.get(m.group('key'))
3905 if is_incomplete(m.group('key')) and actual_value is None:
3906 return True
3907 return op(actual_value)
3908
3909 raise ValueError('Invalid filter part %r' % filter_part)
3910
3911
3912 def match_str(filter_str, dct, incomplete=False):
3913 """ Filter a dictionary with a simple string syntax.
3914 @returns Whether the filter passes
3915 @param incomplete Set of keys that are expected to be missing from dct.
3916 Can be True/False to indicate all/none of the keys may be missing.
3917 All conditions on incomplete keys pass if the key is missing
3918 """
3919 return all(
3920 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3921 for filter_part in re.split(r'(?<!\\)&', filter_str))
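
# Sketch of the filter syntax (assumed sample dicts):
#   match_str('duration > 60 & description', {'duration': 100, 'description': 'x'}) is True
#   match_str('!is_live', {'is_live': True}) is False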
3922
3923
3924 def match_filter_func(filters, breaking_filters=None):
3925 if not filters and not breaking_filters:
3926 return None
3927 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3928 filters = set(variadic(filters or []))
3929
3930 interactive = '-' in filters
3931 if interactive:
3932 filters.remove('-')
3933
3934 def _match_func(info_dict, incomplete=False):
3935 ret = breaking_filters(info_dict, incomplete)
3936 if ret is not None:
3937 raise RejectedVideoReached(ret)
3938
3939 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3940 return NO_DEFAULT if interactive and not incomplete else None
3941 else:
3942 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3943 filter_str = ') | ('.join(map(str.strip, filters))
3944 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3945 return _match_func
3946
3947
3948 class download_range_func:
3949 def __init__(self, chapters, ranges):
3950 self.chapters, self.ranges = chapters, ranges
3951
3952 def __call__(self, info_dict, ydl):
3953 if not self.ranges and not self.chapters:
3954 yield {}
3955
3956 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3957 else 'Cannot match chapters since chapter information is unavailable')
3958 for regex in self.chapters or []:
3959 for i, chapter in enumerate(info_dict.get('chapters') or []):
3960 if re.search(regex, chapter['title']):
3961 warning = None
3962 yield {**chapter, 'index': i}
3963 if self.chapters and warning:
3964 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3965
3966 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3967
3968 def __eq__(self, other):
3969 return (isinstance(other, download_range_func)
3970 and self.chapters == other.chapters and self.ranges == other.ranges)
3971
3972 def __repr__(self):
3973 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3974
3975
3976 def parse_dfxp_time_expr(time_expr):
3977 if not time_expr:
3978 return
3979
3980 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3981 if mobj:
3982 return float(mobj.group('time_offset'))
3983
3984 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3985 if mobj:
3986 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3987
3988
3989 def srt_subtitles_timecode(seconds):
3990 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3991
3992
3993 def ass_subtitles_timecode(seconds):
3994 time = timetuple_from_msec(seconds * 1000)
3995 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
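
# Both helpers format the tuple returned by timetuple_from_msec; hand-checked
# (illustrative): srt_subtitles_timecode(3661.5) == '01:01:01,500' and
# ass_subtitles_timecode(3661.5) == '1:01:01.50'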
3996
3997
3998 def dfxp2srt(dfxp_data):
3999 '''
4000 @param dfxp_data A bytes-like object containing DFXP data
4001 @returns A unicode object containing converted SRT data
4002 '''
4003 LEGACY_NAMESPACES = (
4004 (b'http://www.w3.org/ns/ttml', [
4005 b'http://www.w3.org/2004/11/ttaf1',
4006 b'http://www.w3.org/2006/04/ttaf1',
4007 b'http://www.w3.org/2006/10/ttaf1',
4008 ]),
4009 (b'http://www.w3.org/ns/ttml#styling', [
4010 b'http://www.w3.org/ns/ttml#style',
4011 ]),
4012 )
4013
4014 SUPPORTED_STYLING = [
4015 'color',
4016 'fontFamily',
4017 'fontSize',
4018 'fontStyle',
4019 'fontWeight',
4020 'textDecoration'
4021 ]
4022
4023 _x = functools.partial(xpath_with_ns, ns_map={
4024 'xml': 'http://www.w3.org/XML/1998/namespace',
4025 'ttml': 'http://www.w3.org/ns/ttml',
4026 'tts': 'http://www.w3.org/ns/ttml#styling',
4027 })
4028
4029 styles = {}
4030 default_style = {}
4031
4032 class TTMLPElementParser:
4033 _out = ''
4034 _unclosed_elements = []
4035 _applied_styles = []
4036
4037 def start(self, tag, attrib):
4038 if tag in (_x('ttml:br'), 'br'):
4039 self._out += '\n'
4040 else:
4041 unclosed_elements = []
4042 style = {}
4043 element_style_id = attrib.get('style')
4044 if default_style:
4045 style.update(default_style)
4046 if element_style_id:
4047 style.update(styles.get(element_style_id, {}))
4048 for prop in SUPPORTED_STYLING:
4049 prop_val = attrib.get(_x('tts:' + prop))
4050 if prop_val:
4051 style[prop] = prop_val
4052 if style:
4053 font = ''
4054 for k, v in sorted(style.items()):
4055 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4056 continue
4057 if k == 'color':
4058 font += ' color="%s"' % v
4059 elif k == 'fontSize':
4060 font += ' size="%s"' % v
4061 elif k == 'fontFamily':
4062 font += ' face="%s"' % v
4063 elif k == 'fontWeight' and v == 'bold':
4064 self._out += '<b>'
4065 unclosed_elements.append('b')
4066 elif k == 'fontStyle' and v == 'italic':
4067 self._out += '<i>'
4068 unclosed_elements.append('i')
4069 elif k == 'textDecoration' and v == 'underline':
4070 self._out += '<u>'
4071 unclosed_elements.append('u')
4072 if font:
4073 self._out += '<font' + font + '>'
4074 unclosed_elements.append('font')
4075 applied_style = {}
4076 if self._applied_styles:
4077 applied_style.update(self._applied_styles[-1])
4078 applied_style.update(style)
4079 self._applied_styles.append(applied_style)
4080 self._unclosed_elements.append(unclosed_elements)
4081
4082 def end(self, tag):
4083 if tag not in (_x('ttml:br'), 'br'):
4084 unclosed_elements = self._unclosed_elements.pop()
4085 for element in reversed(unclosed_elements):
4086 self._out += '</%s>' % element
4087 if unclosed_elements and self._applied_styles:
4088 self._applied_styles.pop()
4089
4090 def data(self, data):
4091 self._out += data
4092
4093 def close(self):
4094 return self._out.strip()
4095
4096 def parse_node(node):
4097 target = TTMLPElementParser()
4098 parser = xml.etree.ElementTree.XMLParser(target=target)
4099 parser.feed(xml.etree.ElementTree.tostring(node))
4100 return parser.close()
4101
4102 for k, v in LEGACY_NAMESPACES:
4103 for ns in v:
4104 dfxp_data = dfxp_data.replace(ns, k)
4105
4106 dfxp = compat_etree_fromstring(dfxp_data)
4107 out = []
4108 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4109
4110 if not paras:
4111 raise ValueError('Invalid dfxp/TTML subtitle')
4112
4113 repeat = False
4114 while True:
4115 for style in dfxp.findall(_x('.//ttml:style')):
4116 style_id = style.get('id') or style.get(_x('xml:id'))
4117 if not style_id:
4118 continue
4119 parent_style_id = style.get('style')
4120 if parent_style_id:
4121 if parent_style_id not in styles:
4122 repeat = True
4123 continue
4124 styles[style_id] = styles[parent_style_id].copy()
4125 for prop in SUPPORTED_STYLING:
4126 prop_val = style.get(_x('tts:' + prop))
4127 if prop_val:
4128 styles.setdefault(style_id, {})[prop] = prop_val
4129 if repeat:
4130 repeat = False
4131 else:
4132 break
4133
4134 for p in ('body', 'div'):
4135 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4136 if ele is None:
4137 continue
4138 style = styles.get(ele.get('style'))
4139 if not style:
4140 continue
4141 default_style.update(style)
4142
4143 for para, index in zip(paras, itertools.count(1)):
4144 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4145 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4146 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4147 if begin_time is None:
4148 continue
4149 if not end_time:
4150 if not dur:
4151 continue
4152 end_time = begin_time + dur
4153 out.append('%d\n%s --> %s\n%s\n\n' % (
4154 index,
4155 srt_subtitles_timecode(begin_time),
4156 srt_subtitles_timecode(end_time),
4157 parse_node(para)))
4158
4159 return ''.join(out)
4160
4161
4162 def cli_option(params, command_option, param, separator=None):
4163 param = params.get(param)
4164 return ([] if param is None
4165 else [command_option, str(param)] if separator is None
4166 else [f'{command_option}{separator}{param}'])
4167
4168
4169 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4170 param = params.get(param)
4171 assert param in (True, False, None)
4172 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
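
# cli_bool_option reuses cli_option by treating {True: ..., False: ...} as the
# params dict. Illustrative conversions (assumed option names):
#   cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#       == ['--proxy', 'http://127.0.0.1:3128']
#   cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy', separator='=')
#       == ['--proxy=http://127.0.0.1:3128']
#   cli_bool_option({'check': True}, '--check', 'check') == ['--check', 'true']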
4173
4174
4175 def cli_valueless_option(params, command_option, param, expected_value=True):
4176 return [command_option] if params.get(param) == expected_value else []
4177
4178
4179 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4180 if isinstance(argdict, (list, tuple)): # for backward compatibility
4181 if use_compat:
4182 return argdict
4183 else:
4184 argdict = None
4185 if argdict is None:
4186 return default
4187 assert isinstance(argdict, dict)
4188
4189 assert isinstance(keys, (list, tuple))
4190 for key_list in keys:
4191 arg_list = list(filter(
4192 lambda x: x is not None,
4193 [argdict.get(key.lower()) for key in variadic(key_list)]))
4194 if arg_list:
4195 return [arg for args in arg_list for arg in args]
4196 return default
4197
4198
4199 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4200 main_key, exe = main_key.lower(), exe.lower()
4201 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4202 keys = [f'{root_key}{k}' for k in (keys or [''])]
4203 if root_key in keys:
4204 if main_key != exe:
4205 keys.append((main_key, exe))
4206 keys.append('default')
4207 else:
4208 use_compat = False
4209 return cli_configuration_args(argdict, keys, default, use_compat)
4210
4211
4212 class ISO639Utils:
4213 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4214 _lang_map = {
4215 'aa': 'aar',
4216 'ab': 'abk',
4217 'ae': 'ave',
4218 'af': 'afr',
4219 'ak': 'aka',
4220 'am': 'amh',
4221 'an': 'arg',
4222 'ar': 'ara',
4223 'as': 'asm',
4224 'av': 'ava',
4225 'ay': 'aym',
4226 'az': 'aze',
4227 'ba': 'bak',
4228 'be': 'bel',
4229 'bg': 'bul',
4230 'bh': 'bih',
4231 'bi': 'bis',
4232 'bm': 'bam',
4233 'bn': 'ben',
4234 'bo': 'bod',
4235 'br': 'bre',
4236 'bs': 'bos',
4237 'ca': 'cat',
4238 'ce': 'che',
4239 'ch': 'cha',
4240 'co': 'cos',
4241 'cr': 'cre',
4242 'cs': 'ces',
4243 'cu': 'chu',
4244 'cv': 'chv',
4245 'cy': 'cym',
4246 'da': 'dan',
4247 'de': 'deu',
4248 'dv': 'div',
4249 'dz': 'dzo',
4250 'ee': 'ewe',
4251 'el': 'ell',
4252 'en': 'eng',
4253 'eo': 'epo',
4254 'es': 'spa',
4255 'et': 'est',
4256 'eu': 'eus',
4257 'fa': 'fas',
4258 'ff': 'ful',
4259 'fi': 'fin',
4260 'fj': 'fij',
4261 'fo': 'fao',
4262 'fr': 'fra',
4263 'fy': 'fry',
4264 'ga': 'gle',
4265 'gd': 'gla',
4266 'gl': 'glg',
4267 'gn': 'grn',
4268 'gu': 'guj',
4269 'gv': 'glv',
4270 'ha': 'hau',
4271 'he': 'heb',
4272 'iw': 'heb', # Replaced by he in 1989 revision
4273 'hi': 'hin',
4274 'ho': 'hmo',
4275 'hr': 'hrv',
4276 'ht': 'hat',
4277 'hu': 'hun',
4278 'hy': 'hye',
4279 'hz': 'her',
4280 'ia': 'ina',
4281 'id': 'ind',
4282 'in': 'ind', # Replaced by id in 1989 revision
4283 'ie': 'ile',
4284 'ig': 'ibo',
4285 'ii': 'iii',
4286 'ik': 'ipk',
4287 'io': 'ido',
4288 'is': 'isl',
4289 'it': 'ita',
4290 'iu': 'iku',
4291 'ja': 'jpn',
4292 'jv': 'jav',
4293 'ka': 'kat',
4294 'kg': 'kon',
4295 'ki': 'kik',
4296 'kj': 'kua',
4297 'kk': 'kaz',
4298 'kl': 'kal',
4299 'km': 'khm',
4300 'kn': 'kan',
4301 'ko': 'kor',
4302 'kr': 'kau',
4303 'ks': 'kas',
4304 'ku': 'kur',
4305 'kv': 'kom',
4306 'kw': 'cor',
4307 'ky': 'kir',
4308 'la': 'lat',
4309 'lb': 'ltz',
4310 'lg': 'lug',
4311 'li': 'lim',
4312 'ln': 'lin',
4313 'lo': 'lao',
4314 'lt': 'lit',
4315 'lu': 'lub',
4316 'lv': 'lav',
4317 'mg': 'mlg',
4318 'mh': 'mah',
4319 'mi': 'mri',
4320 'mk': 'mkd',
4321 'ml': 'mal',
4322 'mn': 'mon',
4323 'mr': 'mar',
4324 'ms': 'msa',
4325 'mt': 'mlt',
4326 'my': 'mya',
4327 'na': 'nau',
4328 'nb': 'nob',
4329 'nd': 'nde',
4330 'ne': 'nep',
4331 'ng': 'ndo',
4332 'nl': 'nld',
4333 'nn': 'nno',
4334 'no': 'nor',
4335 'nr': 'nbl',
4336 'nv': 'nav',
4337 'ny': 'nya',
4338 'oc': 'oci',
4339 'oj': 'oji',
4340 'om': 'orm',
4341 'or': 'ori',
4342 'os': 'oss',
4343 'pa': 'pan',
4344 'pi': 'pli',
4345 'pl': 'pol',
4346 'ps': 'pus',
4347 'pt': 'por',
4348 'qu': 'que',
4349 'rm': 'roh',
4350 'rn': 'run',
4351 'ro': 'ron',
4352 'ru': 'rus',
4353 'rw': 'kin',
4354 'sa': 'san',
4355 'sc': 'srd',
4356 'sd': 'snd',
4357 'se': 'sme',
4358 'sg': 'sag',
4359 'si': 'sin',
4360 'sk': 'slk',
4361 'sl': 'slv',
4362 'sm': 'smo',
4363 'sn': 'sna',
4364 'so': 'som',
4365 'sq': 'sqi',
4366 'sr': 'srp',
4367 'ss': 'ssw',
4368 'st': 'sot',
4369 'su': 'sun',
4370 'sv': 'swe',
4371 'sw': 'swa',
4372 'ta': 'tam',
4373 'te': 'tel',
4374 'tg': 'tgk',
4375 'th': 'tha',
4376 'ti': 'tir',
4377 'tk': 'tuk',
4378 'tl': 'tgl',
4379 'tn': 'tsn',
4380 'to': 'ton',
4381 'tr': 'tur',
4382 'ts': 'tso',
4383 'tt': 'tat',
4384 'tw': 'twi',
4385 'ty': 'tah',
4386 'ug': 'uig',
4387 'uk': 'ukr',
4388 'ur': 'urd',
4389 'uz': 'uzb',
4390 've': 'ven',
4391 'vi': 'vie',
4392 'vo': 'vol',
4393 'wa': 'wln',
4394 'wo': 'wol',
4395 'xh': 'xho',
4396 'yi': 'yid',
4397 'ji': 'yid', # Replaced by yi in 1989 revision
4398 'yo': 'yor',
4399 'za': 'zha',
4400 'zh': 'zho',
4401 'zu': 'zul',
4402 }
4403
4404 @classmethod
4405 def short2long(cls, code):
4406 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4407 return cls._lang_map.get(code[:2])
4408
4409 @classmethod
4410 def long2short(cls, code):
4411 """Convert language code from ISO 639-2/T to ISO 639-1"""
4412 for short_name, long_name in cls._lang_map.items():
4413 if long_name == code:
4414 return short_name
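
# e.g. ISO639Utils.short2long('en') == 'eng' and ISO639Utils.long2short('deu') == 'de'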
4415
4416
4417 class ISO3166Utils:
4418 # From http://data.okfn.org/data/core/country-list
4419 _country_map = {
4420 'AF': 'Afghanistan',
4421 'AX': 'Åland Islands',
4422 'AL': 'Albania',
4423 'DZ': 'Algeria',
4424 'AS': 'American Samoa',
4425 'AD': 'Andorra',
4426 'AO': 'Angola',
4427 'AI': 'Anguilla',
4428 'AQ': 'Antarctica',
4429 'AG': 'Antigua and Barbuda',
4430 'AR': 'Argentina',
4431 'AM': 'Armenia',
4432 'AW': 'Aruba',
4433 'AU': 'Australia',
4434 'AT': 'Austria',
4435 'AZ': 'Azerbaijan',
4436 'BS': 'Bahamas',
4437 'BH': 'Bahrain',
4438 'BD': 'Bangladesh',
4439 'BB': 'Barbados',
4440 'BY': 'Belarus',
4441 'BE': 'Belgium',
4442 'BZ': 'Belize',
4443 'BJ': 'Benin',
4444 'BM': 'Bermuda',
4445 'BT': 'Bhutan',
4446 'BO': 'Bolivia, Plurinational State of',
4447 'BQ': 'Bonaire, Sint Eustatius and Saba',
4448 'BA': 'Bosnia and Herzegovina',
4449 'BW': 'Botswana',
4450 'BV': 'Bouvet Island',
4451 'BR': 'Brazil',
4452 'IO': 'British Indian Ocean Territory',
4453 'BN': 'Brunei Darussalam',
4454 'BG': 'Bulgaria',
4455 'BF': 'Burkina Faso',
4456 'BI': 'Burundi',
4457 'KH': 'Cambodia',
4458 'CM': 'Cameroon',
4459 'CA': 'Canada',
4460 'CV': 'Cape Verde',
4461 'KY': 'Cayman Islands',
4462 'CF': 'Central African Republic',
4463 'TD': 'Chad',
4464 'CL': 'Chile',
4465 'CN': 'China',
4466 'CX': 'Christmas Island',
4467 'CC': 'Cocos (Keeling) Islands',
4468 'CO': 'Colombia',
4469 'KM': 'Comoros',
4470 'CG': 'Congo',
4471 'CD': 'Congo, the Democratic Republic of the',
4472 'CK': 'Cook Islands',
4473 'CR': 'Costa Rica',
4474 'CI': 'Côte d\'Ivoire',
4475 'HR': 'Croatia',
4476 'CU': 'Cuba',
4477 'CW': 'Curaçao',
4478 'CY': 'Cyprus',
4479 'CZ': 'Czech Republic',
4480 'DK': 'Denmark',
4481 'DJ': 'Djibouti',
4482 'DM': 'Dominica',
4483 'DO': 'Dominican Republic',
4484 'EC': 'Ecuador',
4485 'EG': 'Egypt',
4486 'SV': 'El Salvador',
4487 'GQ': 'Equatorial Guinea',
4488 'ER': 'Eritrea',
4489 'EE': 'Estonia',
4490 'ET': 'Ethiopia',
4491 'FK': 'Falkland Islands (Malvinas)',
4492 'FO': 'Faroe Islands',
4493 'FJ': 'Fiji',
4494 'FI': 'Finland',
4495 'FR': 'France',
4496 'GF': 'French Guiana',
4497 'PF': 'French Polynesia',
4498 'TF': 'French Southern Territories',
4499 'GA': 'Gabon',
4500 'GM': 'Gambia',
4501 'GE': 'Georgia',
4502 'DE': 'Germany',
4503 'GH': 'Ghana',
4504 'GI': 'Gibraltar',
4505 'GR': 'Greece',
4506 'GL': 'Greenland',
4507 'GD': 'Grenada',
4508 'GP': 'Guadeloupe',
4509 'GU': 'Guam',
4510 'GT': 'Guatemala',
4511 'GG': 'Guernsey',
4512 'GN': 'Guinea',
4513 'GW': 'Guinea-Bissau',
4514 'GY': 'Guyana',
4515 'HT': 'Haiti',
4516 'HM': 'Heard Island and McDonald Islands',
4517 'VA': 'Holy See (Vatican City State)',
4518 'HN': 'Honduras',
4519 'HK': 'Hong Kong',
4520 'HU': 'Hungary',
4521 'IS': 'Iceland',
4522 'IN': 'India',
4523 'ID': 'Indonesia',
4524 'IR': 'Iran, Islamic Republic of',
4525 'IQ': 'Iraq',
4526 'IE': 'Ireland',
4527 'IM': 'Isle of Man',
4528 'IL': 'Israel',
4529 'IT': 'Italy',
4530 'JM': 'Jamaica',
4531 'JP': 'Japan',
4532 'JE': 'Jersey',
4533 'JO': 'Jordan',
4534 'KZ': 'Kazakhstan',
4535 'KE': 'Kenya',
4536 'KI': 'Kiribati',
4537 'KP': 'Korea, Democratic People\'s Republic of',
4538 'KR': 'Korea, Republic of',
4539 'KW': 'Kuwait',
4540 'KG': 'Kyrgyzstan',
4541 'LA': 'Lao People\'s Democratic Republic',
4542 'LV': 'Latvia',
4543 'LB': 'Lebanon',
4544 'LS': 'Lesotho',
4545 'LR': 'Liberia',
4546 'LY': 'Libya',
4547 'LI': 'Liechtenstein',
4548 'LT': 'Lithuania',
4549 'LU': 'Luxembourg',
4550 'MO': 'Macao',
4551 'MK': 'Macedonia, the Former Yugoslav Republic of',
4552 'MG': 'Madagascar',
4553 'MW': 'Malawi',
4554 'MY': 'Malaysia',
4555 'MV': 'Maldives',
4556 'ML': 'Mali',
4557 'MT': 'Malta',
4558 'MH': 'Marshall Islands',
4559 'MQ': 'Martinique',
4560 'MR': 'Mauritania',
4561 'MU': 'Mauritius',
4562 'YT': 'Mayotte',
4563 'MX': 'Mexico',
4564 'FM': 'Micronesia, Federated States of',
4565 'MD': 'Moldova, Republic of',
4566 'MC': 'Monaco',
4567 'MN': 'Mongolia',
4568 'ME': 'Montenegro',
4569 'MS': 'Montserrat',
4570 'MA': 'Morocco',
4571 'MZ': 'Mozambique',
4572 'MM': 'Myanmar',
4573 'NA': 'Namibia',
4574 'NR': 'Nauru',
4575 'NP': 'Nepal',
4576 'NL': 'Netherlands',
4577 'NC': 'New Caledonia',
4578 'NZ': 'New Zealand',
4579 'NI': 'Nicaragua',
4580 'NE': 'Niger',
4581 'NG': 'Nigeria',
4582 'NU': 'Niue',
4583 'NF': 'Norfolk Island',
4584 'MP': 'Northern Mariana Islands',
4585 'NO': 'Norway',
4586 'OM': 'Oman',
4587 'PK': 'Pakistan',
4588 'PW': 'Palau',
4589 'PS': 'Palestine, State of',
4590 'PA': 'Panama',
4591 'PG': 'Papua New Guinea',
4592 'PY': 'Paraguay',
4593 'PE': 'Peru',
4594 'PH': 'Philippines',
4595 'PN': 'Pitcairn',
4596 'PL': 'Poland',
4597 'PT': 'Portugal',
4598 'PR': 'Puerto Rico',
4599 'QA': 'Qatar',
4600 'RE': 'Réunion',
4601 'RO': 'Romania',
4602 'RU': 'Russian Federation',
4603 'RW': 'Rwanda',
4604 'BL': 'Saint Barthélemy',
4605 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4606 'KN': 'Saint Kitts and Nevis',
4607 'LC': 'Saint Lucia',
4608 'MF': 'Saint Martin (French part)',
4609 'PM': 'Saint Pierre and Miquelon',
4610 'VC': 'Saint Vincent and the Grenadines',
4611 'WS': 'Samoa',
4612 'SM': 'San Marino',
4613 'ST': 'Sao Tome and Principe',
4614 'SA': 'Saudi Arabia',
4615 'SN': 'Senegal',
4616 'RS': 'Serbia',
4617 'SC': 'Seychelles',
4618 'SL': 'Sierra Leone',
4619 'SG': 'Singapore',
4620 'SX': 'Sint Maarten (Dutch part)',
4621 'SK': 'Slovakia',
4622 'SI': 'Slovenia',
4623 'SB': 'Solomon Islands',
4624 'SO': 'Somalia',
4625 'ZA': 'South Africa',
4626 'GS': 'South Georgia and the South Sandwich Islands',
4627 'SS': 'South Sudan',
4628 'ES': 'Spain',
4629 'LK': 'Sri Lanka',
4630 'SD': 'Sudan',
4631 'SR': 'Suriname',
4632 'SJ': 'Svalbard and Jan Mayen',
4633 'SZ': 'Swaziland',
4634 'SE': 'Sweden',
4635 'CH': 'Switzerland',
4636 'SY': 'Syrian Arab Republic',
4637 'TW': 'Taiwan, Province of China',
4638 'TJ': 'Tajikistan',
4639 'TZ': 'Tanzania, United Republic of',
4640 'TH': 'Thailand',
4641 'TL': 'Timor-Leste',
4642 'TG': 'Togo',
4643 'TK': 'Tokelau',
4644 'TO': 'Tonga',
4645 'TT': 'Trinidad and Tobago',
4646 'TN': 'Tunisia',
4647 'TR': 'Turkey',
4648 'TM': 'Turkmenistan',
4649 'TC': 'Turks and Caicos Islands',
4650 'TV': 'Tuvalu',
4651 'UG': 'Uganda',
4652 'UA': 'Ukraine',
4653 'AE': 'United Arab Emirates',
4654 'GB': 'United Kingdom',
4655 'US': 'United States',
4656 'UM': 'United States Minor Outlying Islands',
4657 'UY': 'Uruguay',
4658 'UZ': 'Uzbekistan',
4659 'VU': 'Vanuatu',
4660 'VE': 'Venezuela, Bolivarian Republic of',
4661 'VN': 'Viet Nam',
4662 'VG': 'Virgin Islands, British',
4663 'VI': 'Virgin Islands, U.S.',
4664 'WF': 'Wallis and Futuna',
4665 'EH': 'Western Sahara',
4666 'YE': 'Yemen',
4667 'ZM': 'Zambia',
4668 'ZW': 'Zimbabwe',
4669 # Not ISO 3166 codes, but used for IP blocks
4670 'AP': 'Asia/Pacific Region',
4671 'EU': 'Europe',
4672 }
4673
4674 @classmethod
4675 def short2full(cls, code):
4676 """Convert an ISO 3166-2 country code to the corresponding full name"""
4677 return cls._country_map.get(code.upper())
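
# e.g. ISO3166Utils.short2full('gb') == 'United Kingdom' (lookup is case-insensitive)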
4678
4679
4680 class GeoUtils:
4681 # Major IPv4 address blocks per country
4682 _country_ip_map = {
4683 'AD': '46.172.224.0/19',
4684 'AE': '94.200.0.0/13',
4685 'AF': '149.54.0.0/17',
4686 'AG': '209.59.64.0/18',
4687 'AI': '204.14.248.0/21',
4688 'AL': '46.99.0.0/16',
4689 'AM': '46.70.0.0/15',
4690 'AO': '105.168.0.0/13',
4691 'AP': '182.50.184.0/21',
4692 'AQ': '23.154.160.0/24',
4693 'AR': '181.0.0.0/12',
4694 'AS': '202.70.112.0/20',
4695 'AT': '77.116.0.0/14',
4696 'AU': '1.128.0.0/11',
4697 'AW': '181.41.0.0/18',
4698 'AX': '185.217.4.0/22',
4699 'AZ': '5.197.0.0/16',
4700 'BA': '31.176.128.0/17',
4701 'BB': '65.48.128.0/17',
4702 'BD': '114.130.0.0/16',
4703 'BE': '57.0.0.0/8',
4704 'BF': '102.178.0.0/15',
4705 'BG': '95.42.0.0/15',
4706 'BH': '37.131.0.0/17',
4707 'BI': '154.117.192.0/18',
4708 'BJ': '137.255.0.0/16',
4709 'BL': '185.212.72.0/23',
4710 'BM': '196.12.64.0/18',
4711 'BN': '156.31.0.0/16',
4712 'BO': '161.56.0.0/16',
4713 'BQ': '161.0.80.0/20',
4714 'BR': '191.128.0.0/12',
4715 'BS': '24.51.64.0/18',
4716 'BT': '119.2.96.0/19',
4717 'BW': '168.167.0.0/16',
4718 'BY': '178.120.0.0/13',
4719 'BZ': '179.42.192.0/18',
4720 'CA': '99.224.0.0/11',
4721 'CD': '41.243.0.0/16',
4722 'CF': '197.242.176.0/21',
4723 'CG': '160.113.0.0/16',
4724 'CH': '85.0.0.0/13',
4725 'CI': '102.136.0.0/14',
4726 'CK': '202.65.32.0/19',
4727 'CL': '152.172.0.0/14',
4728 'CM': '102.244.0.0/14',
4729 'CN': '36.128.0.0/10',
4730 'CO': '181.240.0.0/12',
4731 'CR': '201.192.0.0/12',
4732 'CU': '152.206.0.0/15',
4733 'CV': '165.90.96.0/19',
4734 'CW': '190.88.128.0/17',
4735 'CY': '31.153.0.0/16',
4736 'CZ': '88.100.0.0/14',
4737 'DE': '53.0.0.0/8',
4738 'DJ': '197.241.0.0/17',
4739 'DK': '87.48.0.0/12',
4740 'DM': '192.243.48.0/20',
4741 'DO': '152.166.0.0/15',
4742 'DZ': '41.96.0.0/12',
4743 'EC': '186.68.0.0/15',
4744 'EE': '90.190.0.0/15',
4745 'EG': '156.160.0.0/11',
4746 'ER': '196.200.96.0/20',
4747 'ES': '88.0.0.0/11',
4748 'ET': '196.188.0.0/14',
4749 'EU': '2.16.0.0/13',
4750 'FI': '91.152.0.0/13',
4751 'FJ': '144.120.0.0/16',
4752 'FK': '80.73.208.0/21',
4753 'FM': '119.252.112.0/20',
4754 'FO': '88.85.32.0/19',
4755 'FR': '90.0.0.0/9',
4756 'GA': '41.158.0.0/15',
4757 'GB': '25.0.0.0/8',
4758 'GD': '74.122.88.0/21',
4759 'GE': '31.146.0.0/16',
4760 'GF': '161.22.64.0/18',
4761 'GG': '62.68.160.0/19',
4762 'GH': '154.160.0.0/12',
4763 'GI': '95.164.0.0/16',
4764 'GL': '88.83.0.0/19',
4765 'GM': '160.182.0.0/15',
4766 'GN': '197.149.192.0/18',
4767 'GP': '104.250.0.0/19',
4768 'GQ': '105.235.224.0/20',
4769 'GR': '94.64.0.0/13',
4770 'GT': '168.234.0.0/16',
4771 'GU': '168.123.0.0/16',
4772 'GW': '197.214.80.0/20',
4773 'GY': '181.41.64.0/18',
4774 'HK': '113.252.0.0/14',
4775 'HN': '181.210.0.0/16',
4776 'HR': '93.136.0.0/13',
4777 'HT': '148.102.128.0/17',
4778 'HU': '84.0.0.0/14',
4779 'ID': '39.192.0.0/10',
4780 'IE': '87.32.0.0/12',
4781 'IL': '79.176.0.0/13',
4782 'IM': '5.62.80.0/20',
4783 'IN': '117.192.0.0/10',
4784 'IO': '203.83.48.0/21',
4785 'IQ': '37.236.0.0/14',
4786 'IR': '2.176.0.0/12',
4787 'IS': '82.221.0.0/16',
4788 'IT': '79.0.0.0/10',
4789 'JE': '87.244.64.0/18',
4790 'JM': '72.27.0.0/17',
4791 'JO': '176.29.0.0/16',
4792 'JP': '133.0.0.0/8',
4793 'KE': '105.48.0.0/12',
4794 'KG': '158.181.128.0/17',
4795 'KH': '36.37.128.0/17',
4796 'KI': '103.25.140.0/22',
4797 'KM': '197.255.224.0/20',
4798 'KN': '198.167.192.0/19',
4799 'KP': '175.45.176.0/22',
4800 'KR': '175.192.0.0/10',
4801 'KW': '37.36.0.0/14',
4802 'KY': '64.96.0.0/15',
4803 'KZ': '2.72.0.0/13',
4804 'LA': '115.84.64.0/18',
4805 'LB': '178.135.0.0/16',
4806 'LC': '24.92.144.0/20',
4807 'LI': '82.117.0.0/19',
4808 'LK': '112.134.0.0/15',
4809 'LR': '102.183.0.0/16',
4810 'LS': '129.232.0.0/17',
4811 'LT': '78.56.0.0/13',
4812 'LU': '188.42.0.0/16',
4813 'LV': '46.109.0.0/16',
4814 'LY': '41.252.0.0/14',
4815 'MA': '105.128.0.0/11',
4816 'MC': '88.209.64.0/18',
4817 'MD': '37.246.0.0/16',
4818 'ME': '178.175.0.0/17',
4819 'MF': '74.112.232.0/21',
4820 'MG': '154.126.0.0/17',
4821 'MH': '117.103.88.0/21',
4822 'MK': '77.28.0.0/15',
4823 'ML': '154.118.128.0/18',
4824 'MM': '37.111.0.0/17',
4825 'MN': '49.0.128.0/17',
4826 'MO': '60.246.0.0/16',
4827 'MP': '202.88.64.0/20',
4828 'MQ': '109.203.224.0/19',
4829 'MR': '41.188.64.0/18',
4830 'MS': '208.90.112.0/22',
4831 'MT': '46.11.0.0/16',
4832 'MU': '105.16.0.0/12',
4833 'MV': '27.114.128.0/18',
4834 'MW': '102.70.0.0/15',
4835 'MX': '187.192.0.0/11',
4836 'MY': '175.136.0.0/13',
4837 'MZ': '197.218.0.0/15',
4838 'NA': '41.182.0.0/16',
4839 'NC': '101.101.0.0/18',
4840 'NE': '197.214.0.0/18',
4841 'NF': '203.17.240.0/22',
4842 'NG': '105.112.0.0/12',
4843 'NI': '186.76.0.0/15',
4844 'NL': '145.96.0.0/11',
4845 'NO': '84.208.0.0/13',
4846 'NP': '36.252.0.0/15',
4847 'NR': '203.98.224.0/19',
4848 'NU': '49.156.48.0/22',
4849 'NZ': '49.224.0.0/14',
4850 'OM': '5.36.0.0/15',
4851 'PA': '186.72.0.0/15',
4852 'PE': '186.160.0.0/14',
4853 'PF': '123.50.64.0/18',
4854 'PG': '124.240.192.0/19',
4855 'PH': '49.144.0.0/13',
4856 'PK': '39.32.0.0/11',
4857 'PL': '83.0.0.0/11',
4858 'PM': '70.36.0.0/20',
4859 'PR': '66.50.0.0/16',
4860 'PS': '188.161.0.0/16',
4861 'PT': '85.240.0.0/13',
4862 'PW': '202.124.224.0/20',
4863 'PY': '181.120.0.0/14',
4864 'QA': '37.210.0.0/15',
4865 'RE': '102.35.0.0/16',
4866 'RO': '79.112.0.0/13',
4867 'RS': '93.86.0.0/15',
4868 'RU': '5.136.0.0/13',
4869 'RW': '41.186.0.0/16',
4870 'SA': '188.48.0.0/13',
4871 'SB': '202.1.160.0/19',
4872 'SC': '154.192.0.0/11',
4873 'SD': '102.120.0.0/13',
4874 'SE': '78.64.0.0/12',
4875 'SG': '8.128.0.0/10',
4876 'SI': '188.196.0.0/14',
4877 'SK': '78.98.0.0/15',
4878 'SL': '102.143.0.0/17',
4879 'SM': '89.186.32.0/19',
4880 'SN': '41.82.0.0/15',
4881 'SO': '154.115.192.0/18',
4882 'SR': '186.179.128.0/17',
4883 'SS': '105.235.208.0/21',
4884 'ST': '197.159.160.0/19',
4885 'SV': '168.243.0.0/16',
4886 'SX': '190.102.0.0/20',
4887 'SY': '5.0.0.0/16',
4888 'SZ': '41.84.224.0/19',
4889 'TC': '65.255.48.0/20',
4890 'TD': '154.68.128.0/19',
4891 'TG': '196.168.0.0/14',
4892 'TH': '171.96.0.0/13',
4893 'TJ': '85.9.128.0/18',
4894 'TK': '27.96.24.0/21',
4895 'TL': '180.189.160.0/20',
4896 'TM': '95.85.96.0/19',
4897 'TN': '197.0.0.0/11',
4898 'TO': '175.176.144.0/21',
4899 'TR': '78.160.0.0/11',
4900 'TT': '186.44.0.0/15',
4901 'TV': '202.2.96.0/19',
4902 'TW': '120.96.0.0/11',
4903 'TZ': '156.156.0.0/14',
4904 'UA': '37.52.0.0/14',
4905 'UG': '102.80.0.0/13',
4906 'US': '6.0.0.0/8',
4907 'UY': '167.56.0.0/13',
4908 'UZ': '84.54.64.0/18',
4909 'VA': '212.77.0.0/19',
4910 'VC': '207.191.240.0/21',
4911 'VE': '186.88.0.0/13',
4912 'VG': '66.81.192.0/20',
4913 'VI': '146.226.0.0/16',
4914 'VN': '14.160.0.0/11',
4915 'VU': '202.80.32.0/20',
4916 'WF': '117.20.32.0/21',
4917 'WS': '202.4.32.0/19',
4918 'YE': '134.35.0.0/16',
4919 'YT': '41.242.116.0/22',
4920 'ZA': '41.0.0.0/11',
4921 'ZM': '102.144.0.0/13',
4922 'ZW': '102.177.192.0/18',
4923 }
4924
4925 @classmethod
4926 def random_ipv4(cls, code_or_block):
4927 if len(code_or_block) == 2:
4928 block = cls._country_ip_map.get(code_or_block.upper())
4929 if not block:
4930 return None
4931 else:
4932 block = code_or_block
4933 addr, preflen = block.split('/')
4934 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4935 addr_max = addr_min | (0xffffffff >> int(preflen))
4936 return str(socket.inet_ntoa(
4937 struct.pack('!L', random.randint(addr_min, addr_max))))
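
# Accepts either a two-letter country code or an explicit CIDR block (sketch):
#   GeoUtils.random_ipv4('AU')            # some address inside 1.128.0.0/11
#   GeoUtils.random_ipv4('192.0.2.0/24')  # some address inside the given block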
4938
4939
4940 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4941 def __init__(self, proxies=None):
4942 # Set default handlers
4943 for type in ('http', 'https'):
4944 setattr(self, '%s_open' % type,
4945 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4946 meth(r, proxy, type))
4947 urllib.request.ProxyHandler.__init__(self, proxies)
4948
4949 def proxy_open(self, req, proxy, type):
4950 req_proxy = req.headers.get('Ytdl-request-proxy')
4951 if req_proxy is not None:
4952 proxy = req_proxy
4953 del req.headers['Ytdl-request-proxy']
4954
4955 if proxy == '__noproxy__':
4956 return None # No Proxy
4957 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4958 req.add_header('Ytdl-socks-proxy', proxy)
4959 # yt-dlp's http/https handlers wrap the socket with socks
4960 return None
4961 return urllib.request.ProxyHandler.proxy_open(
4962 self, req, proxy, type)
4963
4964
4965 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4966 # released into Public Domain
4967 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4968
4969 def long_to_bytes(n, blocksize=0):
4970 """long_to_bytes(n:long, blocksize:int) : string
4971 Convert a long integer to a byte string.
4972
4973 If optional blocksize is given and greater than zero, pad the front of the
4974 byte string with binary zeros so that the length is a multiple of
4975 blocksize.
4976 """
4977 # after much testing, this algorithm was deemed to be the fastest
4978 s = b''
4979 n = int(n)
4980 while n > 0:
4981 s = struct.pack('>I', n & 0xffffffff) + s
4982 n = n >> 32
4983 # strip off leading zeros
4984 for i in range(len(s)):
4985 if s[i] != b'\000'[0]:
4986 break
4987 else:
4988 # only happens when n == 0
4989 s = b'\000'
4990 i = 0
4991 s = s[i:]
4992 # add back some pad bytes. this could be done more efficiently w.r.t. the
4993 # de-padding being done above, but sigh...
4994 if blocksize > 0 and len(s) % blocksize:
4995 s = (blocksize - len(s) % blocksize) * b'\000' + s
4996 return s
4997
4998
4999 def bytes_to_long(s):
5000 """bytes_to_long(string) : long
5001 Convert a byte string to a long integer.
5002
5003 This is (essentially) the inverse of long_to_bytes().
5004 """
5005 acc = 0
5006 length = len(s)
5007 if length % 4:
5008 extra = (4 - length % 4)
5009 s = b'\000' * extra + s
5010 length = length + extra
5011 for i in range(0, length, 4):
5012 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
5013 return acc
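
# Round-trip sketch (hand-checked):
#   bytes_to_long(b'\x01\x00') == 256
#   long_to_bytes(256) == b'\x01\x00'
#   long_to_bytes(256, blocksize=4) == b'\x00\x00\x01\x00'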
5014
5015
5016 def ohdave_rsa_encrypt(data, exponent, modulus):
5017 '''
5018 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
5019
5020 Input:
5021 data: data to encrypt, bytes-like object
5022 exponent, modulus: parameter e and N of RSA algorithm, both integer
5023 Output: hex string of encrypted data
5024
5025 Limitation: supports one block encryption only
5026 '''
5027
5028 payload = int(binascii.hexlify(data[::-1]), 16)
5029 encrypted = pow(payload, exponent, modulus)
5030 return '%x' % encrypted
5031
5032
5033 def pkcs1pad(data, length):
5034 """
5035 Padding input data with PKCS#1 scheme
5036
5037 @param {int[]} data input data
5038 @param {int} length target length
5039 @returns {int[]} padded data
5040 """
5041 if len(data) > length - 11:
5042 raise ValueError('Input data too long for PKCS#1 padding')
5043
5044 pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]  # padding bytes must be non-zero per PKCS#1 v1.5
5045 return [0, 2] + pseudo_random + [0] + data
5046
5047
5048 def _base_n_table(n, table):
5049 if not table and not n:
5050 raise ValueError('Either table or n must be specified')
5051 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
5052
5053 if n and n != len(table):
5054 raise ValueError(f'base {n} exceeds table length {len(table)}')
5055 return table
5056
5057
5058 def encode_base_n(num, n=None, table=None):
5059 """Convert given int to a base-n string"""
5060 table = _base_n_table(n, table)
5061 if not num:
5062 return table[0]
5063
5064 result, base = '', len(table)
5065 while num:
5066 result = table[num % base] + result
5067 num = num // base
5068 return result
5069
5070
5071 def decode_base_n(string, n=None, table=None):
5072 """Convert given base-n string to int"""
5073 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5074 result, base = 0, len(table)
5075 for char in string:
5076 result = result * base + table[char]
5077 return result
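
# e.g. (illustrative): encode_base_n(255, 16) == 'ff' and decode_base_n('ff', 16) == 255;
# a custom alphabet implies the base: encode_base_n(5, table='01') == '101'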
5078
5079
5080 def decode_base(value, digits):
5081 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5082 f'in a future version. Use {__name__}.decode_base_n instead')
5083 return decode_base_n(value, table=digits)
5084
5085
5086 def decode_packed_codes(code):
5087 mobj = re.search(PACKED_CODES_RE, code)
5088 obfuscated_code, base, count, symbols = mobj.groups()
5089 base = int(base)
5090 count = int(count)
5091 symbols = symbols.split('|')
5092 symbol_table = {}
5093
5094 while count:
5095 count -= 1
5096 base_n_count = encode_base_n(count, base)
5097 symbol_table[base_n_count] = symbols[count] or base_n_count
5098
5099 return re.sub(
5100 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5101 obfuscated_code)
5102
5103
5104 def caesar(s, alphabet, shift):
5105 if shift == 0:
5106 return s
5107 l = len(alphabet)
5108 return ''.join(
5109 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5110 for c in s)
5111
5112
5113 def rot47(s):
5114 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
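
# caesar() shifts only characters present in the alphabet and passes the rest
# through; rot47 is thus its own inverse over printable ASCII (illustrative):
#   caesar('abz', 'abcdefghijklmnopqrstuvwxyz', 1) == 'bca'
#   rot47(rot47('any printable ASCII')) == 'any printable ASCII'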
5115
5116
5117 def parse_m3u8_attributes(attrib):
5118 info = {}
5119 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5120 if val.startswith('"'):
5121 val = val[1:-1]
5122 info[key] = val
5123 return info
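
# e.g. (hand-checked; quoted values keep their embedded commas):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
#       == {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}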
5124
5125
5126 def urshift(val, n):
5127 return val >> n if val >= 0 else (val + 0x100000000) >> n
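
# Emulates JavaScript's unsigned 32-bit right shift `>>>` (sketch):
#   urshift(-1, 28) == 0xf  # whereas Python's arithmetic -1 >> 28 == -1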
5128
5129
5130 # Based on png2str() written by @gdkchan and improved by @yokrysty
5131 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5132 def decode_png(png_data):
5133 # Reference: https://www.w3.org/TR/PNG/
5134 header = png_data[8:]
5135
5136 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5137 raise OSError('Not a valid PNG file.')
5138
5139 int_map = {1: '>B', 2: '>H', 4: '>I'}
5140 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5141
5142 chunks = []
5143
5144 while header:
5145 length = unpack_integer(header[:4])
5146 header = header[4:]
5147
5148 chunk_type = header[:4]
5149 header = header[4:]
5150
5151 chunk_data = header[:length]
5152 header = header[length:]
5153
5154 header = header[4:] # Skip CRC
5155
5156 chunks.append({
5157 'type': chunk_type,
5158 'length': length,
5159 'data': chunk_data
5160 })
5161
5162 ihdr = chunks[0]['data']
5163
5164 width = unpack_integer(ihdr[:4])
5165 height = unpack_integer(ihdr[4:8])
5166
5167 idat = b''
5168
5169 for chunk in chunks:
5170 if chunk['type'] == b'IDAT':
5171 idat += chunk['data']
5172
5173 if not idat:
5174 raise OSError('Unable to read PNG data.')
5175
5176 decompressed_data = bytearray(zlib.decompress(idat))
5177
5178 stride = width * 3
5179 pixels = []
5180
5181 def _get_pixel(idx):
5182 x = idx % stride
5183 y = idx // stride
5184 return pixels[y][x]
5185
5186 for y in range(height):
5187 basePos = y * (1 + stride)
5188 filter_type = decompressed_data[basePos]
5189
5190 current_row = []
5191
5192 pixels.append(current_row)
5193
5194 for x in range(stride):
5195 color = decompressed_data[1 + basePos + x]
5196 basex = y * stride + x
5197 left = 0
5198 up = 0
5199
5200 if x > 2:
5201 left = _get_pixel(basex - 3)
5202 if y > 0:
5203 up = _get_pixel(basex - stride)
5204
5205 if filter_type == 1: # Sub
5206 color = (color + left) & 0xff
5207 elif filter_type == 2: # Up
5208 color = (color + up) & 0xff
5209 elif filter_type == 3: # Average
5210 color = (color + ((left + up) >> 1)) & 0xff
5211 elif filter_type == 4: # Paeth
5212 a = left
5213 b = up
5214 c = 0
5215
5216 if x > 2 and y > 0:
5217 c = _get_pixel(basex - stride - 3)
5218
5219 p = a + b - c
5220
5221 pa = abs(p - a)
5222 pb = abs(p - b)
5223 pc = abs(p - c)
5224
5225 if pa <= pb and pa <= pc:
5226 color = (color + a) & 0xff
5227 elif pb <= pc:
5228 color = (color + b) & 0xff
5229 else:
5230 color = (color + c) & 0xff
5231
5232 current_row.append(color)
5233
5234 return width, height, pixels
5235
5236
5237 def write_xattr(path, key, value):
5238 # Windows: Write xattrs to NTFS Alternate Data Streams:
5239 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5240 if compat_os_name == 'nt':
5241 assert ':' not in key
5242 assert os.path.exists(path)
5243
5244 try:
5245 with open(f'{path}:{key}', 'wb') as f:
5246 f.write(value)
5247 except OSError as e:
5248 raise XAttrMetadataError(e.errno, e.strerror)
5249 return
5250
5251 # UNIX Method 1. Use xattrs/pyxattrs modules
5252
5253 setxattr = None
5254 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5255 # Unicode arguments are not supported in pyxattr until version 0.5.0
5256 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5257 if version_tuple(xattr.__version__) >= (0, 5, 0):
5258 setxattr = xattr.set
5259 elif xattr:
5260 setxattr = xattr.setxattr
5261
5262 if setxattr:
5263 try:
5264 setxattr(path, key, value)
5265 except OSError as e:
5266 raise XAttrMetadataError(e.errno, e.strerror)
5267 return
5268
5269 # UNIX Method 2. Use setfattr/xattr executables
5270 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5271 else 'xattr' if check_executable('xattr', ['-h']) else None)
5272 if not exe:
5273 raise XAttrUnavailableError(
5274 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5275 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5276
5277 value = value.decode()
5278 try:
5279 _, stderr, returncode = Popen.run(
5280 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5281 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5282 except OSError as e:
5283 raise XAttrMetadataError(e.errno, e.strerror)
5284 if returncode:
5285 raise XAttrMetadataError(returncode, stderr)
5286
5287
5288 def random_birthday(year_field, month_field, day_field):
5289 start_date = datetime.date(1950, 1, 1)
5290 end_date = datetime.date(1995, 12, 31)
5291 offset = random.randint(0, (end_date - start_date).days)
5292 random_date = start_date + datetime.timedelta(offset)
5293 return {
5294 year_field: str(random_date.year),
5295 month_field: str(random_date.month),
5296 day_field: str(random_date.day),
5297 }
5298
5299
5300 def find_available_port(interface=''):
5301 try:
5302 with socket.socket() as sock:
5303 sock.bind((interface, 0))
5304 return sock.getsockname()[1]
5305 except OSError:
5306 return None
5307
5308
5309 # Templates for internet shortcut files, which are plain text files.
5310 DOT_URL_LINK_TEMPLATE = '''\
5311 [InternetShortcut]
5312 URL=%(url)s
5313 '''
5314
5315 DOT_WEBLOC_LINK_TEMPLATE = '''\
5316 <?xml version="1.0" encoding="UTF-8"?>
5317 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5318 <plist version="1.0">
5319 <dict>
5320 \t<key>URL</key>
5321 \t<string>%(url)s</string>
5322 </dict>
5323 </plist>
5324 '''
5325
5326 DOT_DESKTOP_LINK_TEMPLATE = '''\
5327 [Desktop Entry]
5328 Encoding=UTF-8
5329 Name=%(filename)s
5330 Type=Link
5331 URL=%(url)s
5332 Icon=text-html
5333 '''
5334
5335 LINK_TEMPLATES = {
5336 'url': DOT_URL_LINK_TEMPLATE,
5337 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5338 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5339 }
5340
5341
5342 def iri_to_uri(iri):
5343 """
5344 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5345
5346 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5347 """
5348
5349 iri_parts = urllib.parse.urlparse(iri)
5350
5351 if '[' in iri_parts.netloc:
5352 raise ValueError('IPv6 URIs are not yet supported.')
5353 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5354
5355 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5356
5357 net_location = ''
5358 if iri_parts.username:
5359 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5360 if iri_parts.password is not None:
5361 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5362 net_location += '@'
5363
5364 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5365 # The 'idna' encoding produces ASCII text.
5366 if iri_parts.port is not None and iri_parts.port != 80:
5367 net_location += ':' + str(iri_parts.port)
5368
5369 return urllib.parse.urlunparse(
5370 (iri_parts.scheme,
5371 net_location,
5372
5373 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5374
5375 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5376 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5377
5378 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5379 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5380
5381 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5382
5383 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
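
# Hand-worked example (illustrative): iri_to_uri('https://example.com/über?q=ä')
#   == 'https://example.com/%C3%BCber?q=%C3%A4'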
5384
5385
5386 def to_high_limit_path(path):
5387 if sys.platform in ['win32', 'cygwin']:
5388 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5389 return '\\\\?\\' + os.path.abspath(path)
5390
5391 return path
5392
5393
5394 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5395 val = traverse_obj(obj, *variadic(field))
5396 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5397 return default
5398 return template % func(val)
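
# e.g. (illustrative):
#   format_field({'height': 1080}, 'height', '%sp') == '1080p'
#   format_field({}, 'height', '%sp', default='?') == '?'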
5399
5400
5401 def clean_podcast_url(url):
5402 return re.sub(r'''(?x)
5403 (?:
5404 (?:
5405 chtbl\.com/track|
5406 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5407 play\.podtrac\.com
5408 )/[^/]+|
5409 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5410 flex\.acast\.com|
5411 pd(?:
5412 cn\.co| # https://podcorn.com/analytics-prefix/
5413 st\.fm # https://podsights.com/docs/
5414 )/e
5415 )/''', '', url)
5416
5417
5418 _HEX_TABLE = '0123456789abcdef'
5419
5420
5421 def random_uuidv4():
5422 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5423
5424
5425 def make_dir(path, to_screen=None):
5426 try:
5427 dn = os.path.dirname(path)
5428 if dn:
5429 os.makedirs(dn, exist_ok=True)
5430 return True
5431 except OSError as err:
5432 if callable(to_screen):
5433 to_screen('unable to create directory ' + error_to_compat_str(err))
5434 return False
5435
5436
5437 def get_executable_path():
5438 from .update import _get_variant_and_executable_path
5439
5440 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5441
5442
5443 def get_user_config_dirs(package_name):
5444 # .config (e.g. ~/.config/package_name)
5445 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
5446 yield os.path.join(xdg_config_home, package_name)
5447
5448 # appdata (%APPDATA%/package_name)
5449 appdata_dir = os.getenv('appdata')
5450 if appdata_dir:
5451 yield os.path.join(appdata_dir, package_name)
5452
5453 # home (~/.package_name)
5454 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5455
5456
5457 def get_system_config_dirs(package_name):
5458 # /etc/package_name
5459 yield os.path.join('/etc', package_name)
5460
5461
5462 def traverse_obj(
5463 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5464 casesense=True, is_user_input=False, traverse_string=False):
5465 """
5466 Safely traverse nested `dict`s and `Sequence`s
5467
5468 >>> obj = [{}, {"key": "value"}]
5469 >>> traverse_obj(obj, (1, "key"))
5470 "value"
5471
5472 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5473 The next path will also be tested if the path branched but no results could be found.
5474 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5475 Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
5476
5477 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5478
5479 The keys in the path can be one of:
5480 - `None`: Return the current object.
5481 - `set`: Requires the only item in the set to be a type or function,
5482 like `{type}`/`{func}`. If a `type`, returns only values
5483 of this type. If a function, returns `func(obj)`.
5484 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5485 - `slice`: Branch out and return all values in `obj[key]`.
5486 - `Ellipsis`: Branch out and return a list of all values.
5487 - `tuple`/`list`: Branch out and return a list of all matching values.
5488 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5489 - `function`: Branch out and return values filtered by the function.
5490 Read as: `[value for key, value in obj if function(key, value)]`.
5491 For `Sequence`s, `key` is the index of the value.
5492 For `re.Match`es, `key` is the group number (0 = full match)
5493 as well as additionally any group names, if given.
5494 - `dict`: Transform the current object and return a matching dict.
5495 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5496
5497 `tuple`, `list`, and `dict` all support nested paths and branches.
5498
5499 @param paths Paths by which to traverse.
5500 @param default Value to return if the paths do not match.
5501 If the last key in the path is a `dict`, it will apply to each value inside
5502 the dict instead, depth first. Try to avoid if using nested `dict` keys.
5503 @param expected_type If a `type`, only accept final values of this type.
5504 If any other callable, try to call the function on each result.
5505 If the last key in the path is a `dict`, it will apply to each value inside
5506 the dict instead, recursively. This does respect branching paths.
5507 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5508 @param casesense If `False`, consider string dictionary keys as case insensitive.
5509
5510 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5511
5512 @param is_user_input Whether the keys are generated from user input.
5513 If `True`, strings get converted to `int`/`slice` if needed.
5514 @param traverse_string Whether to traverse into objects as strings.
5515 If `True`, any non-compatible object will first be
5516 converted into a string and then traversed into.
5517 The return value of that path will be a string instead,
5518 not respecting any further branching.
5519
5520
5521 @returns The result of the object traversal.
5522 If successful, `get_all=True`, and the path branches at least once,
5523 then a list of results is returned instead.
5524 If no `default` is given and the last path branches, a `list` of results
5525 is always returned. If a path ends on a `dict`, that result will always be a `dict`.
5526 """
5527 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5528 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5529
5530 if isinstance(expected_type, type):
5531 type_test = lambda val: val if isinstance(val, expected_type) else None
5532 else:
5533 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5534
5535 def apply_key(key, obj, is_last):
5536 branching = False
5537 result = None
5538
5539 if obj is None and traverse_string:
5540 pass
5541
5542 elif key is None:
5543 result = obj
5544
5545 elif isinstance(key, set):
5546 assert len(key) == 1, 'Set should only be used to wrap a single item'
5547 item = next(iter(key))
5548 if isinstance(item, type):
5549 if isinstance(obj, item):
5550 result = obj
5551 else:
5552 result = try_call(item, args=(obj,))
5553
5554 elif isinstance(key, (list, tuple)):
5555 branching = True
5556 result = itertools.chain.from_iterable(
5557 apply_path(obj, branch, is_last)[0] for branch in key)
5558
5559 elif key is ...:
5560 branching = True
5561 if isinstance(obj, collections.abc.Mapping):
5562 result = obj.values()
5563 elif is_sequence(obj):
5564 result = obj
5565 elif isinstance(obj, re.Match):
5566 result = obj.groups()
5567 elif traverse_string:
5568 branching = False
5569 result = str(obj)
5570 else:
5571 result = ()
5572
5573 elif callable(key):
5574 branching = True
5575 if isinstance(obj, collections.abc.Mapping):
5576 iter_obj = obj.items()
5577 elif is_sequence(obj):
5578 iter_obj = enumerate(obj)
5579 elif isinstance(obj, re.Match):
5580 iter_obj = itertools.chain(
5581 enumerate((obj.group(), *obj.groups())),
5582 obj.groupdict().items())
5583 elif traverse_string:
5584 branching = False
5585 iter_obj = enumerate(str(obj))
5586 else:
5587 iter_obj = ()
5588
5589 result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
5590 if not branching: # string traversal
5591 result = ''.join(result)
5592
5593 elif isinstance(key, dict):
5594 iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
5595 result = {
5596 k: v if v is not None else default for k, v in iter_obj
5597 if v is not None or default is not NO_DEFAULT
5598 } or None
5599
5600 elif isinstance(obj, collections.abc.Mapping):
5601 result = (obj.get(key) if casesense or (key in obj) else
5602 next((v for k, v in obj.items() if casefold(k) == key), None))
5603
5604 elif isinstance(obj, re.Match):
5605 if isinstance(key, int) or casesense:
5606 with contextlib.suppress(IndexError):
5607 result = obj.group(key)
5608
5609 elif isinstance(key, str):
5610 result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5611
5612 elif isinstance(key, (int, slice)):
5613 if is_sequence(obj):
5614 branching = isinstance(key, slice)
5615 with contextlib.suppress(IndexError):
5616 result = obj[key]
5617 elif traverse_string:
5618 with contextlib.suppress(IndexError):
5619 result = str(obj)[key]
5620
5621 return branching, result if branching else (result,)
5622
5623 def lazy_last(iterable):
5624 iterator = iter(iterable)
5625 prev = next(iterator, NO_DEFAULT)
5626 if prev is NO_DEFAULT:
5627 return
5628
5629 for item in iterator:
5630 yield False, prev
5631 prev = item
5632
5633 yield True, prev
5634
5635 def apply_path(start_obj, path, test_type):
5636 objs = (start_obj,)
5637 has_branched = False
5638
5639 key = None
5640 for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
5641 if is_user_input and isinstance(key, str):
5642 if key == ':':
5643 key = ...
5644 elif ':' in key:
5645 key = slice(*map(int_or_none, key.split(':')))
5646 elif int_or_none(key) is not None:
5647 key = int(key)
5648
5649 if not casesense and isinstance(key, str):
5650 key = key.casefold()
5651
5652 if __debug__ and callable(key):
5653 # Verify function signature
5654 inspect.signature(key).bind(None, None)
5655
5656 new_objs = []
5657 for obj in objs:
5658 branching, results = apply_key(key, obj, last)
5659 has_branched |= branching
5660 new_objs.append(results)
5661
5662 objs = itertools.chain.from_iterable(new_objs)
5663
5664 if test_type and not isinstance(key, (dict, list, tuple)):
5665 objs = map(type_test, objs)
5666
5667 return objs, has_branched, isinstance(key, dict)
5668
5669 def _traverse_obj(obj, path, allow_empty, test_type):
5670 results, has_branched, is_dict = apply_path(obj, path, test_type)
5671 results = LazyList(item for item in results if item not in (None, {}))
5672 if get_all and has_branched:
5673 if results:
5674 return results.exhaust()
5675 if allow_empty:
5676 return [] if default is NO_DEFAULT else default
5677 return None
5678
5679 return results[0] if results else {} if allow_empty and is_dict else None
5680
5681 for index, path in enumerate(paths, 1):
5682 result = _traverse_obj(obj, path, index == len(paths), True)
5683 if result is not None:
5684 return result
5685
5686 return None if default is NO_DEFAULT else default
5687
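# Hedged usage sketches for traverse_obj (branching with `...` and a `dict`
# transform; the data is made up):
# >>> data = {'formats': [{'url': 'a'}, {'url': 'b'}, {}]}
# >>> traverse_obj(data, ('formats', ..., 'url'))
# ['a', 'b']
# >>> traverse_obj(data, ('formats', 0, {'link': 'url'}))
# {'link': 'a'}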
5688
5689 def traverse_dict(dictn, keys, casesense=True):
5690 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5691 f'in a future version. Use "{__name__}.traverse_obj" instead')
5692 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5693
5694
5695 def get_first(obj, keys, **kwargs):
5696 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5697
5698
5699 def time_seconds(**kwargs):
5700 """
5701 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5702 """
5703 return time.time() + datetime.timedelta(**kwargs).total_seconds()
5704
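# Hedged example: time_seconds(hours=9) returns the current epoch seconds
# shifted by +09:00, i.e. the value that, rendered as UTC, shows the current
# wall-clock time in UTC+9 (e.g. JST).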
5705
5706 # Create a JSON Web Signature (JWS) with the HS256 algorithm
5707 # The resulting format is JWS Compact Serialization
5708 # Implemented following JWT: https://www.rfc-editor.org/rfc/rfc7519.html
5709 # Implemented following JWS: https://www.rfc-editor.org/rfc/rfc7515.html
5710 def jwt_encode_hs256(payload_data, key, headers={}):
5711 header_data = {
5712 'alg': 'HS256',
5713 'typ': 'JWT',
5714 }
5715 if headers:
5716 header_data.update(headers)
5717 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5718 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5719 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5720 signature_b64 = base64.b64encode(h.digest())
5721 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5722 return token
5723
5724
5725 # Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5726 def jwt_decode_hs256(jwt):
5727 header_b64, payload_b64, signature_b64 = jwt.split('.')
5728 # Add trailing '='s that may have been stripped; superfluous '='s are ignored
5729 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5730 return payload_data
5731
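# Hedged round-trip sketch. Note that jwt_encode_hs256 uses standard base64
# (with '=' padding) rather than the unpadded base64url of RFC 7515, so its
# output is not byte-for-byte identical to most JWT libraries:
# >>> token = jwt_encode_hs256({'uid': 1}, 'secret')  # -> b'<header>.<payload>.<signature>'
# >>> jwt_decode_hs256(token.decode())['uid']
# 1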
5732
5733 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5734
5735
5736 @functools.cache
5737 def supports_terminal_sequences(stream):
5738 if compat_os_name == 'nt':
5739 if not WINDOWS_VT_MODE:
5740 return False
5741 elif not os.getenv('TERM'):
5742 return False
5743 try:
5744 return stream.isatty()
5745 except BaseException:
5746 return False
5747
5748
5749 def windows_enable_vt_mode():
5750 """Ref: https://bugs.python.org/issue30075 """
5751 if get_windows_version() < (10, 0, 10586):
5752 return
5753
5754 import ctypes
5755 import ctypes.wintypes
5756 import msvcrt
5757
5758 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5759
5760 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5761 handle = os.open('CONOUT$', os.O_RDWR)
5762 try:
5763 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5764 dw_original_mode = ctypes.wintypes.DWORD()
5765 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5766 if not success:
5767 raise Exception('GetConsoleMode failed')
5768
5769 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5770 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5771 if not success:
5772 raise Exception('SetConsoleMode failed')
5773 finally:
5774 os.close(handle)
5775
5776 global WINDOWS_VT_MODE
5777 WINDOWS_VT_MODE = True
5778 supports_terminal_sequences.cache_clear()
5779
5780
5781 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5782
5783
5784 def remove_terminal_sequences(string):
5785 return _terminal_sequences_re.sub('', string)
5786
5787
5788 def number_of_digits(number):
5789 return len('%d' % number)
5790
5791
5792 def join_nonempty(*values, delim='-', from_dict=None):
5793 if from_dict is not None:
5794 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5795 return delim.join(map(str, filter(None, values)))
5796
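# Hedged examples (falsy values, including 0, are dropped by filter(None)):
# >>> join_nonempty('mp4', None, 1080, delim='-')
# 'mp4-1080'
# >>> join_nonempty('en', '', 'US', delim='_')
# 'en_US'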
5797
5798 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5799 """
5800 Find the largest format dimensions in terms of video width and, for each thumbnail:
5801 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5802 * Update dimensions
5803
5804 This function is useful with video services that scale the provided thumbnails on demand
5805 """
5806 _keys = ('width', 'height')
5807 max_dimensions = max(
5808 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5809 default=(0, 0))
5810 if not max_dimensions[0]:
5811 return thumbnails
5812 return [
5813 merge_dicts(
5814 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5815 dict(zip(_keys, max_dimensions)), thumbnail)
5816 for thumbnail in thumbnails
5817 ]
5818
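# Hedged example (the URLs are made up; `merge_dicts` gives earlier dicts
# priority, so the rewritten URL and the format dimensions win over any old
# thumbnail values):
# >>> scale_thumbnails_to_max_format_width(
# ...     [{'width': 1920, 'height': 1080}],
# ...     [{'url': 'https://cdn.example.com/thumb_320.jpg'}], r'\d+(?=\.jpg)')
# [{'url': 'https://cdn.example.com/thumb_1920.jpg', 'width': 1920, 'height': 1080}]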
5819
5820 def parse_http_range(range):
5821 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5822 if not range:
5823 return None, None, None
5824 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5825 if not crg:
5826 return None, None, None
5827 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5828
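# Hedged examples:
# >>> parse_http_range('bytes 0-499/1234')   # Content-Range style
# (0, 499, 1234)
# >>> parse_http_range('bytes=500-')         # Range style, open-ended
# (500, None, None)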
5829
5830 def read_stdin(what):
5831 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5832 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5833 return sys.stdin
5834
5835
5836 def determine_file_encoding(data):
5837 """
5838 Detect the text encoding used
5839 @returns (encoding, bytes to skip)
5840 """
5841
5842 # BOM markers are given priority over coding declarations
5843 for bom, enc in BOMS:
5844 if data.startswith(bom):
5845 return enc, len(bom)
5846
5847 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5848 # We ignore the endianness to get a good enough match
5849 data = data.replace(b'\0', b'')
5850 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5851 return mobj.group(1).decode() if mobj else None, 0
5852
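# Hedged example (a PEP 263-style coding declaration, no BOM):
# >>> determine_file_encoding(b'# coding: utf-8\n--flag value')
# ('utf-8', 0)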
5853
5854 class Config:
5855 own_args = None
5856 parsed_args = None
5857 filename = None
5858 __initialized = False
5859
5860 def __init__(self, parser, label=None):
5861 self.parser, self.label = parser, label
5862 self._loaded_paths, self.configs = set(), []
5863
5864 def init(self, args=None, filename=None):
5865 assert not self.__initialized
5866 self.own_args, self.filename = args, filename
5867 return self.load_configs()
5868
5869 def load_configs(self):
5870 directory = ''
5871 if self.filename:
5872 location = os.path.realpath(self.filename)
5873 directory = os.path.dirname(location)
5874 if location in self._loaded_paths:
5875 return False
5876 self._loaded_paths.add(location)
5877
5878 self.__initialized = True
5879 opts, _ = self.parser.parse_known_args(self.own_args)
5880 self.parsed_args = self.own_args
5881 for location in opts.config_locations or []:
5882 if location == '-':
5883 if location in self._loaded_paths:
5884 continue
5885 self._loaded_paths.add(location)
5886 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5887 continue
5888 location = os.path.join(directory, expand_path(location))
5889 if os.path.isdir(location):
5890 location = os.path.join(location, 'yt-dlp.conf')
5891 if not os.path.exists(location):
5892 self.parser.error(f'config location {location} does not exist')
5893 self.append_config(self.read_file(location), location)
5894 return True
5895
5896 def __str__(self):
5897 label = join_nonempty(
5898 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5899 delim=' ')
5900 return join_nonempty(
5901 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5902 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5903 delim='\n')
5904
5905 @staticmethod
5906 def read_file(filename, default=[]):
5907 try:
5908 optionf = open(filename, 'rb')
5909 except OSError:
5910 return default # silently skip if file is not present
5911 try:
5912 enc, skip = determine_file_encoding(optionf.read(512))
5913 optionf.seek(skip, io.SEEK_SET)
5914 except OSError:
5915 enc = None # silently skip read errors
5916 try:
5917 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5918 contents = optionf.read().decode(enc or preferredencoding())
5919 res = shlex.split(contents, comments=True)
5920 except Exception as err:
5921 raise ValueError(f'Unable to parse "{filename}": {err}')
5922 finally:
5923 optionf.close()
5924 return res
5925
5926 @staticmethod
5927 def hide_login_info(opts):
5928 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5929 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5930
5931 def _scrub_eq(o):
5932 m = eqre.match(o)
5933 if m:
5934 return m.group('key') + '=PRIVATE'
5935 else:
5936 return o
5937
5938 opts = list(map(_scrub_eq, opts))
5939 for idx, opt in enumerate(opts):
5940 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5941 opts[idx + 1] = 'PRIVATE'
5942 return opts
5943
5944 def append_config(self, *args, label=None):
5945 config = type(self)(self.parser, label)
5946 config._loaded_paths = self._loaded_paths
5947 if config.init(*args):
5948 self.configs.append(config)
5949
5950 @property
5951 def all_args(self):
5952 for config in reversed(self.configs):
5953 yield from config.all_args
5954 yield from self.parsed_args or []
5955
5956 def parse_known_args(self, **kwargs):
5957 return self.parser.parse_known_args(self.all_args, **kwargs)
5958
5959 def parse_args(self):
5960 return self.parser.parse_args(self.all_args)
5961
5962
5963 class WebSocketsWrapper:
5964 """Wraps websockets module to use in non-async scopes"""
5965 pool = None
5966
5967 def __init__(self, url, headers=None, connect=True):
5968 self.loop = asyncio.new_event_loop()
5969 # XXX: "loop" is deprecated
5970 self.conn = websockets.connect(
5971 url, extra_headers=headers, ping_interval=None,
5972 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5973 if connect:
5974 self.__enter__()
5975 atexit.register(self.__exit__, None, None, None)
5976
5977 def __enter__(self):
5978 if not self.pool:
5979 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5980 return self
5981
5982 def send(self, *args):
5983 self.run_with_loop(self.pool.send(*args), self.loop)
5984
5985 def recv(self, *args):
5986 return self.run_with_loop(self.pool.recv(*args), self.loop)
5987
5988 def __exit__(self, type, value, traceback):
5989 try:
5990 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5991 finally:
5992 self._cancel_all_tasks(self.loop)  # cancel leftover tasks before closing the loop
5993 self.loop.close()
5994
5995 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5996 # For contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5997 @staticmethod
5998 def run_with_loop(main, loop):
5999 if not asyncio.iscoroutine(main):
6000 raise ValueError(f'a coroutine was expected, got {main!r}')
6001
6002 try:
6003 return loop.run_until_complete(main)
6004 finally:
6005 loop.run_until_complete(loop.shutdown_asyncgens())
6006 if hasattr(loop, 'shutdown_default_executor'):
6007 loop.run_until_complete(loop.shutdown_default_executor())
6008
6009 @staticmethod
6010 def _cancel_all_tasks(loop):
6011 to_cancel = asyncio.all_tasks(loop)
6012
6013 if not to_cancel:
6014 return
6015
6016 for task in to_cancel:
6017 task.cancel()
6018
6019 # XXX: "loop" is removed in python 3.10+
6020 loop.run_until_complete(
6021 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
6022
6023 for task in to_cancel:
6024 if task.cancelled():
6025 continue
6026 if task.exception() is not None:
6027 loop.call_exception_handler({
6028 'message': 'unhandled exception during asyncio.run() shutdown',
6029 'exception': task.exception(),
6030 'task': task,
6031 })
6032
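# Hedged usage sketch (the URL and header are made up; requires the optional
# `websockets` dependency):
# ws = WebSocketsWrapper('wss://example.com/socket', headers={'Origin': 'https://example.com'})
# ws.send('{"op": "subscribe"}')
# reply = ws.recv()
# ws.__exit__(None, None, None)  # or rely on the registered atexit handler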
6033
6034 def merge_headers(*dicts):
6035 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
6036 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
6037
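# Hedged example (keys are normalized via str.title(), later dicts win):
# >>> merge_headers({'user-agent': 'UA-1', 'Accept': '*/*'}, {'User-Agent': 'UA-2'})
# {'User-Agent': 'UA-2', 'Accept': '*/*'}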
6038
6039 def cached_method(f):
6040 """Cache a method"""
6041 signature = inspect.signature(f)
6042
6043 @functools.wraps(f)
6044 def wrapper(self, *args, **kwargs):
6045 bound_args = signature.bind(self, *args, **kwargs)
6046 bound_args.apply_defaults()
6047 key = tuple(bound_args.arguments.values())[1:]
6048
6049 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
6050 if key not in cache:
6051 cache[key] = f(self, *args, **kwargs)
6052 return cache[key]
6053 return wrapper
6054
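# Hedged usage sketch (class and method names are illustrative):
# class Extractor:
#     @cached_method
#     def fetch(self, url):
#         ...  # expensive work happens only once per distinct `url`
# The cache lives in the instance dict, keyed by the bound arguments after `self`.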
6055
6056 class classproperty:
6057 """property access for class methods with optional caching"""
6058 def __new__(cls, func=None, *args, **kwargs):
6059 if not func:
6060 return functools.partial(cls, *args, **kwargs)
6061 return super().__new__(cls)
6062
6063 def __init__(self, func, *, cache=False):
6064 functools.update_wrapper(self, func)
6065 self.func = func
6066 self._cache = {} if cache else None
6067
6068 def __get__(self, _, cls):
6069 if self._cache is None:
6070 return self.func(cls)
6071 elif cls not in self._cache:
6072 self._cache[cls] = self.func(cls)
6073 return self._cache[cls]
6074
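# Hedged usage sketch (names are illustrative):
# class MyIE:
#     @classproperty(cache=True)
#     def _VALID_URL_RE(cls):
#         return re.compile(cls._VALID_URL)  # compiled once per class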
6075
6076 class function_with_repr:
6077 def __init__(self, func, repr_=None):
6078 functools.update_wrapper(self, func)
6079 self.func, self.__repr = func, repr_
6080
6081 def __call__(self, *args, **kwargs):
6082 return self.func(*args, **kwargs)
6083
6084 def __repr__(self):
6085 if self.__repr:
6086 return self.__repr
6087 return f'{self.func.__module__}.{self.func.__qualname__}'
6088
6089
6090 class Namespace(types.SimpleNamespace):
6091 """Immutable namespace"""
6092
6093 def __iter__(self):
6094 return iter(self.__dict__.values())
6095
6096 @property
6097 def items_(self):
6098 return self.__dict__.items()
6099
6100
6101 MEDIA_EXTENSIONS = Namespace(
6102 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
6103 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
6104 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
6105 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
6106 thumbnails=('jpg', 'png', 'webp'),
6107 storyboards=('mhtml', ),
6108 subtitles=('srt', 'vtt', 'ass', 'lrc'),
6109 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
6110 )
6111 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
6112 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
6113
6114 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
6115
6116
6117 class RetryManager:
6118 """Usage:
6119 for retry in RetryManager(...):
6120 try:
6121 ...
6122 except SomeException as err:
6123 retry.error = err
6124 continue
6125 """
6126 attempt, _error = 0, None
6127
6128 def __init__(self, _retries, _error_callback, **kwargs):
6129 self.retries = _retries or 0
6130 self.error_callback = functools.partial(_error_callback, **kwargs)
6131
6132 def _should_retry(self):
6133 return self._error is not NO_DEFAULT and self.attempt <= self.retries
6134
6135 @property
6136 def error(self):
6137 if self._error is NO_DEFAULT:
6138 return None
6139 return self._error
6140
6141 @error.setter
6142 def error(self, value):
6143 self._error = value
6144
6145 def __iter__(self):
6146 while self._should_retry():
6147 self.error = NO_DEFAULT
6148 self.attempt += 1
6149 yield self
6150 if self.error:
6151 self.error_callback(self.error, self.attempt, self.retries)
6152
6153 @staticmethod
6154 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6155 """Utility function for reporting retries"""
6156 if count > retries:
6157 if error:
6158 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6159 raise e
6160
6161 if not count:
6162 return warn(e)
6163 elif isinstance(e, ExtractorError):
6164 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
6165 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6166
6167 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6168 if delay:
6169 info(f'Sleeping {delay:.2f} seconds ...')
6170 time.sleep(delay)
6171
6172
6173 def make_archive_id(ie, video_id):
6174 ie_key = ie if isinstance(ie, str) else ie.ie_key()
6175 return f'{ie_key.lower()} {video_id}'
6176
6177
6178 def truncate_string(s, left, right=0):
6179 assert left > 3 and right >= 0
6180 if s is None or len(s) <= left + right:
6181 return s
6182 return f'{s[:left-3]}...{s[-right:] if right else ""}'
6183
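# Hedged examples:
# >>> truncate_string('abcdefghijklmnop', 7)
# 'abcd...'
# >>> truncate_string('abcdefghijklmnop', 7, 3)
# 'abcd...nop'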
6184
6185 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6186 assert 'all' in alias_dict, '"all" alias is required'
6187 requested = list(start or [])
6188 for val in options:
6189 discard = val.startswith('-')
6190 if discard:
6191 val = val[1:]
6192
6193 if val in alias_dict:
6194 val = alias_dict[val] if not discard else [
6195 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6196 # NB: Do not allow regex in aliases for performance
6197 requested = orderedSet_from_options(val, alias_dict, start=requested)
6198 continue
6199
6200 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6201 else [val] if val in alias_dict['all'] else None)
6202 if current is None:
6203 raise ValueError(val)
6204
6205 if discard:
6206 for item in current:
6207 while item in requested:
6208 requested.remove(item)
6209 else:
6210 requested.extend(current)
6211
6212 return orderedSet(requested)
6213
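# Hedged example for orderedSet_from_options (the alias map is made up; the
# 'all' alias is required):
# >>> aliases = {'all': ['info', 'comments', 'thumbnails']}
# >>> orderedSet_from_options(['all', '-comments'], aliases)
# ['info', 'thumbnails']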
6214
6215 class FormatSorter:
6216 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
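# A hedged reading of the sort-key syntax matched above: '+res:1080' parses to
# reverse='+', field='res', separator=':', limit='1080'; 'filesize~100M' uses
# '~' to prefer values closest to the limit rather than treating it as an
# upper preference bound.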
6217
6218 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6219 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6220 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6221 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6222 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6223 'fps', 'fs_approx', 'source', 'id')
6224
6225 settings = {
6226 'vcodec': {'type': 'ordered', 'regex': True,
6227 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6228 'acodec': {'type': 'ordered', 'regex': True,
6229 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6230 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6231 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6232 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6233 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6234 'vext': {'type': 'ordered', 'field': 'video_ext',
6235 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6236 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6237 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
6238 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
6239 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
6240 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6241 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6242 'field': ('vcodec', 'acodec'),
6243 'function': lambda it: int(any(v != 'none' for v in it))},
6244 'ie_pref': {'priority': True, 'type': 'extractor'},
6245 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6246 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6247 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6248 'quality': {'convert': 'float', 'default': -1},
6249 'filesize': {'convert': 'bytes'},
6250 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6251 'id': {'convert': 'string', 'field': 'format_id'},
6252 'height': {'convert': 'float_none'},
6253 'width': {'convert': 'float_none'},
6254 'fps': {'convert': 'float_none'},
6255 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6256 'tbr': {'convert': 'float_none'},
6257 'vbr': {'convert': 'float_none'},
6258 'abr': {'convert': 'float_none'},
6259 'asr': {'convert': 'float_none'},
6260 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6261
6262 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6263 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6264 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6265 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6266 'res': {'type': 'multiple', 'field': ('height', 'width'),
6267 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6268
6269 # Actual field names
6270 'format_id': {'type': 'alias', 'field': 'id'},
6271 'preference': {'type': 'alias', 'field': 'ie_pref'},
6272 'language_preference': {'type': 'alias', 'field': 'lang'},
6273 'source_preference': {'type': 'alias', 'field': 'source'},
6274 'protocol': {'type': 'alias', 'field': 'proto'},
6275 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6276 'audio_channels': {'type': 'alias', 'field': 'channels'},
6277
6278 # Deprecated
6279 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6280 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6281 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6282 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6283 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6284 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6285 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6286 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6287 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6288 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6289 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6290 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6291 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6292 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6293 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6294 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6295 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6296 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6297 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6298 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6299 }
6300
6301 def __init__(self, ydl, field_preference):
6302 self.ydl = ydl
6303 self._order = []
6304 self.evaluate_params(self.ydl.params, field_preference)
6305 if ydl.params.get('verbose'):
6306 self.print_verbose_info(self.ydl.write_debug)
6307
6308 def _get_field_setting(self, field, key):
6309 if field not in self.settings:
6310 if key in ('forced', 'priority'):
6311 return False
6312 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6313 'deprecated and may be removed in a future version')
6314 self.settings[field] = {}
6315 propObj = self.settings[field]
6316 if key not in propObj:
6317 type = propObj.get('type')
6318 if key == 'field':
6319 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6320 elif key == 'convert':
6321 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6322 else:
6323 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6324 propObj[key] = default
6325 return propObj[key]
6326
6327 def _resolve_field_value(self, field, value, convertNone=False):
6328 if value is None:
6329 if not convertNone:
6330 return None
6331 else:
6332 value = value.lower()
6333 conversion = self._get_field_setting(field, 'convert')
6334 if conversion == 'ignore':
6335 return None
6336 if conversion == 'string':
6337 return value
6338 elif conversion == 'float_none':
6339 return float_or_none(value)
6340 elif conversion == 'bytes':
6341 return parse_bytes(value)
6342 elif conversion == 'order':
6343 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6344 use_regex = self._get_field_setting(field, 'regex')
6345 list_length = len(order_list)
6346 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6347 if use_regex and value is not None:
6348 for i, regex in enumerate(order_list):
6349 if regex and re.match(regex, value):
6350 return list_length - i
6351 return list_length - empty_pos # not in list
6352 else: # not regex or value = None
6353 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6354 else:
6355 if value.isnumeric():
6356 return float(value)
6357 else:
6358 self.settings[field]['convert'] = 'string'
6359 return value
6360
6361 def evaluate_params(self, params, sort_extractor):
6362 self._use_free_order = params.get('prefer_free_formats', False)
6363 self._sort_user = params.get('format_sort', [])
6364 self._sort_extractor = sort_extractor
6365
6366 def add_item(field, reverse, closest, limit_text):
6367 field = field.lower()
6368 if field in self._order:
6369 return
6370 self._order.append(field)
6371 limit = self._resolve_field_value(field, limit_text)
6372 data = {
6373 'reverse': reverse,
6374 'closest': False if limit is None else closest,
6375 'limit_text': limit_text,
6376 'limit': limit}
6377 if field in self.settings:
6378 self.settings[field].update(data)
6379 else:
6380 self.settings[field] = data
6381
6382 sort_list = (
6383 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6384 + (tuple() if params.get('format_sort_force', False)
6385 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6386 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6387
6388 for item in sort_list:
6389 match = re.match(self.regex, item)
6390 if match is None:
6391 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6392 field = match.group('field')
6393 if field is None:
6394 continue
6395 if self._get_field_setting(field, 'type') == 'alias':
6396 alias, field = field, self._get_field_setting(field, 'field')
6397 if self._get_field_setting(alias, 'deprecated'):
6398 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6399 f'be removed in a future version. Please use {field} instead')
6400 reverse = match.group('reverse') is not None
6401 closest = match.group('separator') == '~'
6402 limit_text = match.group('limit')
6403
6404 has_limit = limit_text is not None
6405 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6406 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6407
6408 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6409 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6410 limit_count = len(limits)
6411 for (i, f) in enumerate(fields):
6412 add_item(f, reverse, closest,
6413 limits[i] if i < limit_count
6414 else limits[0] if has_limit and not has_multiple_limits
6415 else None)
6416
6417 def print_verbose_info(self, write_debug):
6418 if self._sort_user:
6419 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6420 if self._sort_extractor:
6421 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6422 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6423 '+' if self._get_field_setting(field, 'reverse') else '', field,
6424 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6425 self._get_field_setting(field, 'limit_text'),
6426 self._get_field_setting(field, 'limit'))
6427 if self._get_field_setting(field, 'limit_text') is not None else '')
6428 for field in self._order if self._get_field_setting(field, 'visible')]))
6429
6430 def _calculate_field_preference_from_value(self, format, field, type, value):
6431 reverse = self._get_field_setting(field, 'reverse')
6432 closest = self._get_field_setting(field, 'closest')
6433 limit = self._get_field_setting(field, 'limit')
6434
6435 if type == 'extractor':
6436 maximum = self._get_field_setting(field, 'max')
6437 if value is None or (maximum is not None and value >= maximum):
6438 value = -1
6439 elif type == 'boolean':
6440 in_list = self._get_field_setting(field, 'in_list')
6441 not_in_list = self._get_field_setting(field, 'not_in_list')
6442 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6443 elif type == 'ordered':
6444 value = self._resolve_field_value(field, value, True)
6445
6446 # try to convert to number
6447 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6448 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6449 if is_num:
6450 value = val_num
6451
6452 return ((-10, 0) if value is None
6453 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6454 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6455 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6456 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6457 else (-1, value, 0))
6458
6459 def _calculate_field_preference(self, format, field):
6460 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6461 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6462 if type == 'multiple':
6463 type = 'field' # Only 'field' is allowed in multiple for now
6464 actual_fields = self._get_field_setting(field, 'field')
6465
6466 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6467 else:
6468 value = get_value(field)
6469 return self._calculate_field_preference_from_value(format, field, type, value)
6470
6471 def calculate_preference(self, format):
6472 # Determine missing protocol
6473 if not format.get('protocol'):
6474 format['protocol'] = determine_protocol(format)
6475
6476 # Determine missing ext
6477 if not format.get('ext') and 'url' in format:
6478 format['ext'] = determine_ext(format['url'])
6479 if format.get('vcodec') == 'none':
6480 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6481 format['video_ext'] = 'none'
6482 else:
6483 format['video_ext'] = format['ext']
6484 format['audio_ext'] = 'none'
6485 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6486 # format['preference'] = -1000
6487
6488 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6489 # HEVC-over-FLV is out of spec per FLV's original specification
6490 # ref. https://trac.ffmpeg.org/ticket/6389
6491 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6492 format['preference'] = -100
6493
6494 # Determine missing bitrates
6495 if format.get('tbr') is None:
6496 if format.get('vbr') is not None and format.get('abr') is not None:
6497 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6498 else:
6499 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6500 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6501 if format.get('acodec') != 'none' and format.get('abr') is None:
6502 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6503
6504 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6505
6506
6507 # Deprecated
6508 has_certifi = bool(certifi)
6509 has_websockets = bool(websockets)
6510
6511
6512 def load_plugins(name, suffix, namespace):
6513 from .plugins import load_plugins
6514 ret = load_plugins(name, suffix)
6515 namespace.update(ret)
6516 return ret