import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise

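# Illustrative use (the filename here is hypothetical):
#
#   write_json_file({'id': 'abc', 'title': 'Example'}, 'info.json')
#
# The data is written to a NamedTemporaryFile in the target directory and then
# renamed over 'info.json', so readers never observe a half-written file.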


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

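# For instance, with a namespace map (URI invented for illustration):
#
#   xpath_with_ns('./ns:videos/ns:video', {'ns': 'http://example.com/ns'})
#   # -> './{http://example.com/ns}videos/{http://example.com/ns}video'
#
# i.e. the Clark notation that xml.etree.ElementTree expects.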

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

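# A minimal sketch of what the generator yields (markup invented):
#
#   list(get_elements_text_and_html_by_attribute(
#       'class', 'foo', '<div class="foo">bar</div>'))
#   # -> [('bar', '<div class="foo">bar</div>')]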

class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

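# The parser-driven scan pairs nested tags correctly. A small example
# (markup invented for illustration):
#
#   get_element_text_and_html_by_tag('div', '<div><div>a</div>b</div>')
#   # -> ('<div>a</div>b', '<div><div>a</div>b</div>')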

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs

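# For example (note that html.parser lowercases attribute names):
#
#   extract_attributes('<a href="#" CLASS=btn>')  # -> {'href': '#', 'class': 'btn'}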

def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

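# e.g. clean_html('<p>Hello<br/>world</p>') should give 'Hello\nworld':
# whitespace is collapsed, <br> and </p><p> become newlines, remaining tags
# are stripped and entities decoded.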

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)

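# A sketch of how this decoder plugs into json.loads (trailing garbage invented):
#
#   json.loads('{"a": 1} trailing junk', cls=LenientJSONDecoder, ignore_extra=True)
#   # -> {'a': 1}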

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
        s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

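# Rough before/after pairs (outputs traced by hand, so treat as illustrative):
#
#   sanitize_filename('A/B: C', restricted=True)  # -> 'A_B_-_C'
#   sanitize_filename('A/B: C')                   # -> 'A⧸B： C' (full-width substitutes)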

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

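# e.g. sanitize_url('//example.com/x')           # -> 'http://example.com/x'
#      sanitize_url('rmtp://example.com/live')   # -> 'rtmp://example.com/live'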

def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

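# e.g. extract_basic_auth('http://user:pass@example.com/x') should yield
# ('http://example.com/x', 'Basic dXNlcjpwYXNz') -- the credentials are moved
# out of the netloc and into an Authorization header value.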

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

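# e.g. orderedSet([1, 2, 1, 3])       # -> [1, 2, 3]; first occurrence wins
#      orderedSet('abca', lazy=True)  # -> generator over 'a', 'b', 'c'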

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

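# Named, decimal and hex references all collapse to the same character, e.g.
# unescapeHTML('&amp; &#38; &#x26;') -> '& & &'.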

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or '', stderr or '', proc.returncode

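# A plausible call pattern for Popen.run (the command here is invented):
#
#   stdout, stderr, returncode = Popen.run(
#       ['ffprobe', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#
# The subprocess is killed if communicate() is interrupted, e.g. by Ctrl+C.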

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

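# e.g. timetuple_from_msec(61500)  # -> Time(hours=0, minutes=1, seconds=1, milliseconds=500)
#      formatSeconds(3661)         # -> '1:01:01'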

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

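# e.g. handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})
# drops both the internal marker and any Accept-Encoding header, returning {}.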

class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection

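# Typical internal use (proxy address invented): http_open() swaps in
#
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#
# so the connection is tunnelled through the SOCKS proxy transparently.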

class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True

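# A plausible round trip (the file name is invented):
#
#   jar = YoutubeDLCookieJar('cookies.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)
#   # ... use jar with an OpenerDirector ...
#   jar.save()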

class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Strip the entity headers, since the payload is not forwarded
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)

1706
1707 def extract_timezone(date_str):
1708 m = re.search(
1709 r'''(?x)
1710 ^.{8,}? # >=8 char non-TZ prefix, if present
1711 (?P<tz>Z| # just the UTC Z, or
1712 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1713 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1714 [ ]? # optional space
1715 (?P<sign>\+|-) # +/-
1716 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1717 $)
1718 ''', date_str)
1719 if not m:
1720 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1721 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1722 if timezone is not None:
1723 date_str = date_str[:-len(m.group('tz'))]
1724 timezone = datetime.timedelta(hours=timezone or 0)
1725 else:
1726 date_str = date_str[:-len(m.group('tz'))]
1727 if not m.group('sign'):
1728 timezone = datetime.timedelta()
1729 else:
1730 sign = 1 if m.group('sign') == '+' else -1
1731 timezone = datetime.timedelta(
1732 hours=sign * int(m.group('hours')),
1733 minutes=sign * int(m.group('minutes')))
1734 return timezone, date_str
1735
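# Illustrative sketch (not part of the original source; output verified by hand):
# >>> extract_timezone('2014-12-14T14:30:00+0200')
# (datetime.timedelta(seconds=7200), '2014-12-14T14:30:00')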
1736
1737 def parse_iso8601(date_str, delimiter='T', timezone=None):
1738 """ Return a UNIX timestamp from the given date """
1739
1740 if date_str is None:
1741 return None
1742
1743 date_str = re.sub(r'\.[0-9]+', '', date_str)
1744
1745 if timezone is None:
1746 timezone, date_str = extract_timezone(date_str)
1747
1748 with contextlib.suppress(ValueError):
1749 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1750 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1751 return calendar.timegm(dt.timetuple())
1752
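# For example, continuing the sketch above (output computed by hand):
# >>> parse_iso8601('2014-12-14T14:30:00+0200')
# 1418560200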
1753
1754 def date_formats(day_first=True):
1755 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1756
1757
1758 def unified_strdate(date_str, day_first=True):
1759 """Return a string with the date in the format YYYYMMDD"""
1760
1761 if date_str is None:
1762 return None
1763 upload_date = None
1764 # Replace commas
1765 date_str = date_str.replace(',', ' ')
1766 # Remove AM/PM + timezone
1767 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1768 _, date_str = extract_timezone(date_str)
1769
1770 for expression in date_formats(day_first):
1771 with contextlib.suppress(ValueError):
1772 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1773 if upload_date is None:
1774 timetuple = email.utils.parsedate_tz(date_str)
1775 if timetuple:
1776 with contextlib.suppress(ValueError):
1777 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1778 if upload_date is not None:
1779 return str(upload_date)
1780
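# Illustrative example, assuming DATE_FORMATS (defined elsewhere in this
# module) contains '%B %d %Y':
# >>> unified_strdate('December 21, 2010')
# '20101221'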
1781
1782 def unified_timestamp(date_str, day_first=True):
1783 if date_str is None:
1784 return None
1785
1786 date_str = re.sub(r'\s+', ' ', re.sub(
1787 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1788
1789 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1790 timezone, date_str = extract_timezone(date_str)
1791
1792 # Remove AM/PM + timezone
1793 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1794
1795 # Remove unrecognized timezones from ISO 8601 alike timestamps
1796 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1797 if m:
1798 date_str = date_str[:-len(m.group('tz'))]
1799
1800 # Python only supports microseconds, so remove nanoseconds
1801 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1802 if m:
1803 date_str = m.group(1)
1804
1805 for expression in date_formats(day_first):
1806 with contextlib.suppress(ValueError):
1807 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1808 return calendar.timegm(dt.timetuple())
1809
1810 timetuple = email.utils.parsedate_tz(date_str)
1811 if timetuple:
1812 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1813
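# Illustrative example, assuming DATE_FORMATS contains '%Y-%m-%d %H:%M:%S'
# (timestamps without a timezone are treated as UTC):
# >>> unified_timestamp('2014-12-14 14:30:00')
# 1418567400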
1814
1815 def determine_ext(url, default_ext='unknown_video'):
1816 if url is None or '.' not in url:
1817 return default_ext
1818 guess = url.partition('?')[0].rpartition('.')[2]
1819 if re.match(r'^[A-Za-z0-9]+$', guess):
1820 return guess
1821 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1822 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1823 return guess.rstrip('/')
1824 else:
1825 return default_ext
1826
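# Illustrative example matching the comment above, assuming 'mp4' is in
# KNOWN_EXTENSIONS (defined elsewhere in this module):
# >>> determine_ext('http://example.com/foo/bar.mp4/?download')
# 'mp4'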
1827
1828 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1829 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1830
1831
1832 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1833 R"""
1834 Return a datetime object from a string.
1835 Supported format:
1836 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1837
1838 @param format strftime format of DATE
1839 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1840 auto: round to the unit provided in date_str (if applicable).
1841 """
1842 auto_precision = False
1843 if precision == 'auto':
1844 auto_precision = True
1845 precision = 'microsecond'
1846 today = datetime_round(datetime.datetime.utcnow(), precision)
1847 if date_str in ('now', 'today'):
1848 return today
1849 if date_str == 'yesterday':
1850 return today - datetime.timedelta(days=1)
1851 match = re.match(
1852 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1853 date_str)
1854 if match is not None:
1855 start_time = datetime_from_str(match.group('start'), precision, format)
1856 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1857 unit = match.group('unit')
1858 if unit == 'month' or unit == 'year':
1859 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1860 unit = 'day'
1861 else:
1862 if unit == 'week':
1863 unit = 'day'
1864 time *= 7
1865 delta = datetime.timedelta(**{unit + 's': time})
1866 new_date = start_time + delta
1867 if auto_precision:
1868 return datetime_round(new_date, unit)
1869 return new_date
1870
1871 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1872
1873
1874 def date_from_str(date_str, format='%Y%m%d', strict=False):
1875 R"""
1876 Return a date object from a string using datetime_from_str
1877
1878 @param strict Restrict allowed patterns to "YYYYMMDD" and
1879 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1880 """
1881 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1882 raise ValueError(f'Invalid date format "{date_str}"')
1883 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1884
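# Illustrative sketch of the relative-date syntax accepted above (barring a
# midnight rollover between the two calls):
# >>> date_from_str('now-1week') == date_from_str('today') - datetime.timedelta(days=7)
# True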
1885
1886 def datetime_add_months(dt, months):
1887 """Increment/Decrement a datetime object by months."""
1888 month = dt.month + months - 1
1889 year = dt.year + month // 12
1890 month = month % 12 + 1
1891 day = min(dt.day, calendar.monthrange(year, month)[1])
1892 return dt.replace(year, month, day)
1893
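# The day is clamped to the length of the target month, e.g.:
# >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
# datetime.datetime(2020, 2, 29, 0, 0)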
1894
1895 def datetime_round(dt, precision='day'):
1896 """
1897 Round a datetime object's time to a specific precision
1898 """
1899 if precision == 'microsecond':
1900 return dt
1901
1902 unit_seconds = {
1903 'day': 86400,
1904 'hour': 3600,
1905 'minute': 60,
1906 'second': 1,
1907 }
1908 roundto = lambda x, n: ((x + n / 2) // n) * n
1909 timestamp = calendar.timegm(dt.timetuple())
1910 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1911
1912
1913 def hyphenate_date(date_str):
1914 """
1915 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1916 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1917 if match is not None:
1918 return '-'.join(match.groups())
1919 else:
1920 return date_str
1921
1922
1923 class DateRange:
1924 """Represents a time interval between two dates"""
1925
1926 def __init__(self, start=None, end=None):
1927 """start and end must be strings in the format accepted by date"""
1928 if start is not None:
1929 self.start = date_from_str(start, strict=True)
1930 else:
1931 self.start = datetime.datetime.min.date()
1932 if end is not None:
1933 self.end = date_from_str(end, strict=True)
1934 else:
1935 self.end = datetime.datetime.max.date()
1936 if self.start > self.end:
1937 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1938
1939 @classmethod
1940 def day(cls, day):
1941 """Returns a range that only contains the given day"""
1942 return cls(day, day)
1943
1944 def __contains__(self, date):
1945 """Check if the date is in the range"""
1946 if not isinstance(date, datetime.date):
1947 date = date_from_str(date)
1948 return self.start <= date <= self.end
1949
1950 def __str__(self):
1951 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1952
1953 def __eq__(self, other):
1954 return (isinstance(other, DateRange)
1955 and self.start == other.start and self.end == other.end)
1956
1957
1958 def platform_name():
1959 """ Returns the platform name as a str """
1960 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
1961 return platform.platform()
1962
1963
1964 @functools.cache
1965 def system_identifier():
1966 python_implementation = platform.python_implementation()
1967 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1968 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1969
1970 return 'Python %s (%s %s) - %s %s' % (
1971 platform.python_version(),
1972 python_implementation,
1973 platform.architecture()[0],
1974 platform.platform(),
1975 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1976 )
1977
1978
1979 @functools.cache
1980 def get_windows_version():
1981 ''' Get Windows version. Returns () if not running on Windows '''
1982 if compat_os_name == 'nt':
1983 return version_tuple(platform.win32_ver()[1])
1984 else:
1985 return ()
1986
1987
1988 def write_string(s, out=None, encoding=None):
1989 assert isinstance(s, str)
1990 out = out or sys.stderr
1991
1992 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1993 s = re.sub(r'([\r\n]+)', r' \1', s)
1994
1995 enc, buffer = None, out
1996 if 'b' in getattr(out, 'mode', ''):
1997 enc = encoding or preferredencoding()
1998 elif hasattr(out, 'buffer'):
1999 buffer = out.buffer
2000 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2001
2002 buffer.write(s.encode(enc, 'ignore') if enc else s)
2003 out.flush()
2004
2005
2006 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2007 from . import _IN_CLI
2008 if _IN_CLI:
2009 if msg in deprecation_warning._cache:
2010 return
2011 deprecation_warning._cache.add(msg)
2012 if printer:
2013 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2014 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2015 else:
2016 import warnings
2017 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2018
2019
2020 deprecation_warning._cache = set()
2021
2022
2023 def bytes_to_intlist(bs):
2024 if not bs:
2025 return []
2026 if isinstance(bs[0], int): # indexing bytes yields ints on Python 3
2027 return list(bs)
2028 else:
2029 return [ord(c) for c in bs]
2030
2031
2032 def intlist_to_bytes(xs):
2033 if not xs:
2034 return b''
2035 return struct.pack('%dB' % len(xs), *xs)
2036
2037
2038 class LockingUnsupportedError(OSError):
2039 msg = 'File locking is not supported'
2040
2041 def __init__(self):
2042 super().__init__(self.msg)
2043
2044
2045 # Cross-platform file locking
2046 if sys.platform == 'win32':
2047 import ctypes
2048 import ctypes.wintypes
2049 import msvcrt
2050
2051 class OVERLAPPED(ctypes.Structure):
2052 _fields_ = [
2053 ('Internal', ctypes.wintypes.LPVOID),
2054 ('InternalHigh', ctypes.wintypes.LPVOID),
2055 ('Offset', ctypes.wintypes.DWORD),
2056 ('OffsetHigh', ctypes.wintypes.DWORD),
2057 ('hEvent', ctypes.wintypes.HANDLE),
2058 ]
2059
2060 kernel32 = ctypes.windll.kernel32
2061 LockFileEx = kernel32.LockFileEx
2062 LockFileEx.argtypes = [
2063 ctypes.wintypes.HANDLE, # hFile
2064 ctypes.wintypes.DWORD, # dwFlags
2065 ctypes.wintypes.DWORD, # dwReserved
2066 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2067 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2068 ctypes.POINTER(OVERLAPPED) # Overlapped
2069 ]
2070 LockFileEx.restype = ctypes.wintypes.BOOL
2071 UnlockFileEx = kernel32.UnlockFileEx
2072 UnlockFileEx.argtypes = [
2073 ctypes.wintypes.HANDLE, # hFile
2074 ctypes.wintypes.DWORD, # dwReserved
2075 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2076 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2077 ctypes.POINTER(OVERLAPPED) # Overlapped
2078 ]
2079 UnlockFileEx.restype = ctypes.wintypes.BOOL
2080 whole_low = 0xffffffff
2081 whole_high = 0x7fffffff
2082
2083 def _lock_file(f, exclusive, block):
2084 overlapped = OVERLAPPED()
2085 overlapped.Offset = 0
2086 overlapped.OffsetHigh = 0
2087 overlapped.hEvent = 0
2088 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2089
2090 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2091 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2092 0, whole_low, whole_high, f._lock_file_overlapped_p):
2093 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2094 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2095
2096 def _unlock_file(f):
2097 assert f._lock_file_overlapped_p
2098 handle = msvcrt.get_osfhandle(f.fileno())
2099 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2100 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2101
2102 else:
2103 try:
2104 import fcntl
2105
2106 def _lock_file(f, exclusive, block):
2107 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2108 if not block:
2109 flags |= fcntl.LOCK_NB
2110 try:
2111 fcntl.flock(f, flags)
2112 except BlockingIOError:
2113 raise
2114 except OSError: # AOSP does not have flock()
2115 fcntl.lockf(f, flags)
2116
2117 def _unlock_file(f):
2118 try:
2119 fcntl.flock(f, fcntl.LOCK_UN)
2120 except OSError:
2121 fcntl.lockf(f, fcntl.LOCK_UN)
2122
2123 except ImportError:
2124
2125 def _lock_file(f, exclusive, block):
2126 raise LockingUnsupportedError()
2127
2128 def _unlock_file(f):
2129 raise LockingUnsupportedError()
2130
2131
2132 class locked_file:
2133 locked = False
2134
2135 def __init__(self, filename, mode, block=True, encoding=None):
2136 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2137 raise NotImplementedError(mode)
2138 self.mode, self.block = mode, block
2139
2140 writable = any(f in mode for f in 'wax+')
2141 readable = any(f in mode for f in 'r+')
2142 flags = functools.reduce(operator.ior, (
2143 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2144 getattr(os, 'O_BINARY', 0), # Windows only
2145 getattr(os, 'O_NOINHERIT', 0), # Windows only
2146 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2147 os.O_APPEND if 'a' in mode else 0,
2148 os.O_EXCL if 'x' in mode else 0,
2149 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2150 ))
2151
2152 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2153
2154 def __enter__(self):
2155 exclusive = 'r' not in self.mode
2156 try:
2157 _lock_file(self.f, exclusive, self.block)
2158 self.locked = True
2159 except OSError:
2160 self.f.close()
2161 raise
2162 if 'w' in self.mode:
2163 try:
2164 self.f.truncate()
2165 except OSError as e:
2166 if e.errno not in (
2167 errno.ESPIPE, # Illegal seek - expected for FIFO
2168 errno.EINVAL, # Invalid argument - expected for /dev/null
2169 ):
2170 raise
2171 return self
2172
2173 def unlock(self):
2174 if not self.locked:
2175 return
2176 try:
2177 _unlock_file(self.f)
2178 finally:
2179 self.locked = False
2180
2181 def __exit__(self, *_):
2182 try:
2183 self.unlock()
2184 finally:
2185 self.f.close()
2186
2187 open = __enter__
2188 close = __exit__
2189
2190 def __getattr__(self, attr):
2191 return getattr(self.f, attr)
2192
2193 def __iter__(self):
2194 return iter(self.f)
2195
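# Illustrative usage (hypothetical filename); the lock is held for the
# duration of the `with` block and released on exit:
# with locked_file('state.txt', 'w', block=True) as f:
#     f.write('exclusive access while the lock is held\n')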
2196
2197 @functools.cache
2198 def get_filesystem_encoding():
2199 encoding = sys.getfilesystemencoding()
2200 return encoding if encoding is not None else 'utf-8'
2201
2202
2203 def shell_quote(args):
2204 quoted_args = []
2205 encoding = get_filesystem_encoding()
2206 for a in args:
2207 if isinstance(a, bytes):
2208 # We may get a filename encoded with 'encodeFilename'
2209 a = a.decode(encoding)
2210 quoted_args.append(compat_shlex_quote(a))
2211 return ' '.join(quoted_args)
2212
2213
2214 def smuggle_url(url, data):
2215 """ Pass additional data in a URL for internal use. """
2216
2217 url, idata = unsmuggle_url(url, {})
2218 data.update(idata)
2219 sdata = urllib.parse.urlencode(
2220 {'__youtubedl_smuggle': json.dumps(data)})
2221 return url + '#' + sdata
2222
2223
2224 def unsmuggle_url(smug_url, default=None):
2225 if '#__youtubedl_smuggle' not in smug_url:
2226 return smug_url, default
2227 url, _, sdata = smug_url.rpartition('#')
2228 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2229 data = json.loads(jsond)
2230 return url, data
2231
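# Illustrative round trip (output verified by hand):
# >>> smuggle_url('http://example.com/video', {'source': 'web'})
# 'http://example.com/video#__youtubedl_smuggle=%7B%22source%22%3A+%22web%22%7D'
# >>> unsmuggle_url('http://example.com/video#__youtubedl_smuggle=%7B%22source%22%3A+%22web%22%7D')
# ('http://example.com/video', {'source': 'web'})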
2232
2233 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2234 """ Formats numbers with decimal sufixes like K, M, etc """
2235 num, factor = float_or_none(num), float(factor)
2236 if num is None or num < 0:
2237 return None
2238 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2239 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2240 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2241 if factor == 1024:
2242 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2243 converted = num / (factor ** exponent)
2244 return fmt % (converted, suffix)
2245
2246
2247 def format_bytes(bytes):
2248 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2249
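# Illustrative examples (outputs computed by hand):
# >>> format_decimal_suffix(123456)
# '123k'
# >>> format_bytes(123456)
# '120.56KiB'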
2250
2251 def lookup_unit_table(unit_table, s):
2252 units_re = '|'.join(re.escape(u) for u in unit_table)
2253 m = re.match(
2254 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2255 if not m:
2256 return None
2257 num_str = m.group('num').replace(',', '.')
2258 mult = unit_table[m.group('unit')]
2259 return int(float(num_str) * mult)
2260
2261
2262 def parse_filesize(s):
2263 if s is None:
2264 return None
2265
2266 # The lower-case forms are of course incorrect and unofficial,
2267 # but we support those too
2268 _UNIT_TABLE = {
2269 'B': 1,
2270 'b': 1,
2271 'bytes': 1,
2272 'KiB': 1024,
2273 'KB': 1000,
2274 'kB': 1024,
2275 'Kb': 1000,
2276 'kb': 1000,
2277 'kilobytes': 1000,
2278 'kibibytes': 1024,
2279 'MiB': 1024 ** 2,
2280 'MB': 1000 ** 2,
2281 'mB': 1024 ** 2,
2282 'Mb': 1000 ** 2,
2283 'mb': 1000 ** 2,
2284 'megabytes': 1000 ** 2,
2285 'mebibytes': 1024 ** 2,
2286 'GiB': 1024 ** 3,
2287 'GB': 1000 ** 3,
2288 'gB': 1024 ** 3,
2289 'Gb': 1000 ** 3,
2290 'gb': 1000 ** 3,
2291 'gigabytes': 1000 ** 3,
2292 'gibibytes': 1024 ** 3,
2293 'TiB': 1024 ** 4,
2294 'TB': 1000 ** 4,
2295 'tB': 1024 ** 4,
2296 'Tb': 1000 ** 4,
2297 'tb': 1000 ** 4,
2298 'terabytes': 1000 ** 4,
2299 'tebibytes': 1024 ** 4,
2300 'PiB': 1024 ** 5,
2301 'PB': 1000 ** 5,
2302 'pB': 1024 ** 5,
2303 'Pb': 1000 ** 5,
2304 'pb': 1000 ** 5,
2305 'petabytes': 1000 ** 5,
2306 'pebibytes': 1024 ** 5,
2307 'EiB': 1024 ** 6,
2308 'EB': 1000 ** 6,
2309 'eB': 1024 ** 6,
2310 'Eb': 1000 ** 6,
2311 'eb': 1000 ** 6,
2312 'exabytes': 1000 ** 6,
2313 'exbibytes': 1024 ** 6,
2314 'ZiB': 1024 ** 7,
2315 'ZB': 1000 ** 7,
2316 'zB': 1024 ** 7,
2317 'Zb': 1000 ** 7,
2318 'zb': 1000 ** 7,
2319 'zettabytes': 1000 ** 7,
2320 'zebibytes': 1024 ** 7,
2321 'YiB': 1024 ** 8,
2322 'YB': 1000 ** 8,
2323 'yB': 1024 ** 8,
2324 'Yb': 1000 ** 8,
2325 'yb': 1000 ** 8,
2326 'yottabytes': 1000 ** 8,
2327 'yobibytes': 1024 ** 8,
2328 }
2329
2330 return lookup_unit_table(_UNIT_TABLE, s)
2331
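# Illustrative examples showing the decimal/binary unit distinction:
# >>> parse_filesize('1.5 GB')
# 1500000000
# >>> parse_filesize('1.5 GiB')
# 1610612736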
2332
2333 def parse_count(s):
2334 if s is None:
2335 return None
2336
2337 s = re.sub(r'^[^\d]+\s', '', s).strip()
2338
2339 if re.match(r'^[\d,.]+$', s):
2340 return str_to_int(s)
2341
2342 _UNIT_TABLE = {
2343 'k': 1000,
2344 'K': 1000,
2345 'm': 1000 ** 2,
2346 'M': 1000 ** 2,
2347 'kk': 1000 ** 2,
2348 'KK': 1000 ** 2,
2349 'b': 1000 ** 3,
2350 'B': 1000 ** 3,
2351 }
2352
2353 ret = lookup_unit_table(_UNIT_TABLE, s)
2354 if ret is not None:
2355 return ret
2356
2357 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2358 if mobj:
2359 return str_to_int(mobj.group(1))
2360
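# Illustrative examples (outputs computed by hand):
# >>> parse_count('1.2M')
# 1200000
# >>> parse_count('1,234 views')
# 1234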
2361
2362 def parse_resolution(s, *, lenient=False):
2363 if s is None:
2364 return {}
2365
2366 if lenient:
2367 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2368 else:
2369 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2370 if mobj:
2371 return {
2372 'width': int(mobj.group('w')),
2373 'height': int(mobj.group('h')),
2374 }
2375
2376 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2377 if mobj:
2378 return {'height': int(mobj.group(1))}
2379
2380 mobj = re.search(r'\b([48])[kK]\b', s)
2381 if mobj:
2382 return {'height': int(mobj.group(1)) * 540}
2383
2384 return {}
2385
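# Illustrative examples; note that '4k'/'8k' map to heights via a factor of 540:
# >>> parse_resolution('1920x1080')
# {'width': 1920, 'height': 1080}
# >>> parse_resolution('4k')
# {'height': 2160}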
2386
2387 def parse_bitrate(s):
2388 if not isinstance(s, str):
2389 return
2390 mobj = re.search(r'\b(\d+)\s*kbps', s)
2391 if mobj:
2392 return int(mobj.group(1))
2393
2394
2395 def month_by_name(name, lang='en'):
2396 """ Return the number of a month by (locale-independently) English name """
2397
2398 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2399
2400 try:
2401 return month_names.index(name) + 1
2402 except ValueError:
2403 return None
2404
2405
2406 def month_by_abbreviation(abbrev):
2407 """ Return the number of a month by (locale-independently) English
2408 abbreviations """
2409
2410 try:
2411 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2412 except ValueError:
2413 return None
2414
2415
2416 def fix_xml_ampersands(xml_str):
2417 """Replace all the '&' by '&amp;' in XML"""
2418 return re.sub(
2419 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2420 '&amp;',
2421 xml_str)
2422
2423
2424 def setproctitle(title):
2425 assert isinstance(title, str)
2426
2427 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2428 try:
2429 import ctypes
2430 except ImportError:
2431 return
2432
2433 try:
2434 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2435 except OSError:
2436 return
2437 except TypeError:
2438 # LoadLibrary in Windows Python 2.7.13 only expects
2439 # a bytestring, but since unicode_literals turns
2440 # every string into a unicode string, it fails.
2441 return
2442 title_bytes = title.encode()
2443 buf = ctypes.create_string_buffer(len(title_bytes))
2444 buf.value = title_bytes
2445 try:
2446 libc.prctl(15, buf, 0, 0, 0)
2447 except AttributeError:
2448 return # Strange libc, just skip this
2449
2450
2451 def remove_start(s, start):
2452 return s[len(start):] if s is not None and s.startswith(start) else s
2453
2454
2455 def remove_end(s, end):
2456 return s[:-len(end)] if s is not None and s.endswith(end) else s
2457
2458
2459 def remove_quotes(s):
2460 if s is None or len(s) < 2:
2461 return s
2462 for quote in ('"', "'", ):
2463 if s[0] == quote and s[-1] == quote:
2464 return s[1:-1]
2465 return s
2466
2467
2468 def get_domain(url):
2469 """
2470 This implementation is inconsistent, but is kept for compatibility.
2471 Use this only for "webpage_url_domain"
2472 """
2473 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2474
2475
2476 def url_basename(url):
2477 path = urllib.parse.urlparse(url).path
2478 return path.strip('/').split('/')[-1]
2479
2480
2481 def base_url(url):
2482 return re.match(r'https?://[^?#]+/', url).group()
2483
2484
2485 def urljoin(base, path):
2486 if isinstance(path, bytes):
2487 path = path.decode()
2488 if not isinstance(path, str) or not path:
2489 return None
2490 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2491 return path
2492 if isinstance(base, bytes):
2493 base = base.decode()
2494 if not isinstance(base, str) or not re.match(
2495 r'^(?:https?:)?//', base):
2496 return None
2497 return urllib.parse.urljoin(base, path)
2498
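# Illustrative examples; scheme-relative paths are returned unchanged:
# >>> urljoin('https://example.com/a/', 'b/c')
# 'https://example.com/a/b/c'
# >>> urljoin('https://example.com/a/', '//cdn.example.com/d')
# '//cdn.example.com/d'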
2499
2500 class HEADRequest(urllib.request.Request):
2501 def get_method(self):
2502 return 'HEAD'
2503
2504
2505 class PUTRequest(urllib.request.Request):
2506 def get_method(self):
2507 return 'PUT'
2508
2509
2510 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2511 if get_attr and v is not None:
2512 v = getattr(v, get_attr, None)
2513 try:
2514 return int(v) * invscale // scale
2515 except (ValueError, TypeError, OverflowError):
2516 return default
2517
2518
2519 def str_or_none(v, default=None):
2520 return default if v is None else str(v)
2521
2522
2523 def str_to_int(int_str):
2524 """ A more relaxed version of int_or_none """
2525 if isinstance(int_str, int):
2526 return int_str
2527 elif isinstance(int_str, str):
2528 int_str = re.sub(r'[,\.\+]', '', int_str)
2529 return int_or_none(int_str)
2530
2531
2532 def float_or_none(v, scale=1, invscale=1, default=None):
2533 if v is None:
2534 return default
2535 try:
2536 return float(v) * invscale / scale
2537 except (ValueError, TypeError):
2538 return default
2539
2540
2541 def bool_or_none(v, default=None):
2542 return v if isinstance(v, bool) else default
2543
2544
2545 def strip_or_none(v, default=None):
2546 return v.strip() if isinstance(v, str) else default
2547
2548
2549 def url_or_none(url):
2550 if not url or not isinstance(url, str):
2551 return None
2552 url = url.strip()
2553 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2554
2555
2556 def request_to_url(req):
2557 if isinstance(req, urllib.request.Request):
2558 return req.get_full_url()
2559 else:
2560 return req
2561
2562
2563 def strftime_or_none(timestamp, date_format, default=None):
2564 datetime_object = None
2565 try:
2566 if isinstance(timestamp, (int, float)): # unix timestamp
2567 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2568 elif isinstance(timestamp, str): # assume YYYYMMDD
2569 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2570 return datetime_object.strftime(date_format)
2571 except (ValueError, TypeError, AttributeError):
2572 return default
2573
2574
2575 def parse_duration(s):
2576 if not isinstance(s, str):
2577 return None
2578 s = s.strip()
2579 if not s:
2580 return None
2581
2582 days, hours, mins, secs, ms = [None] * 5
2583 m = re.match(r'''(?x)
2584 (?P<before_secs>
2585 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2586 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2587 (?P<ms>[.:][0-9]+)?Z?$
2588 ''', s)
2589 if m:
2590 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2591 else:
2592 m = re.match(
2593 r'''(?ix)(?:P?
2594 (?:
2595 [0-9]+\s*y(?:ears?)?,?\s*
2596 )?
2597 (?:
2598 [0-9]+\s*m(?:onths?)?,?\s*
2599 )?
2600 (?:
2601 [0-9]+\s*w(?:eeks?)?,?\s*
2602 )?
2603 (?:
2604 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2605 )?
2606 T)?
2607 (?:
2608 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2609 )?
2610 (?:
2611 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2612 )?
2613 (?:
2614 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2615 )?Z?$''', s)
2616 if m:
2617 days, hours, mins, secs, ms = m.groups()
2618 else:
2619 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2620 if m:
2621 hours, mins = m.groups()
2622 else:
2623 return None
2624
2625 if ms:
2626 ms = ms.replace(':', '.')
2627 return sum(float(part or 0) * mult for part, mult in (
2628 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2629
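# Illustrative examples covering the colon-separated and ISO 8601-like forms:
# >>> parse_duration('1:23:45')
# 5025.0
# >>> parse_duration('PT1H30M')
# 5400.0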
2630
2631 def prepend_extension(filename, ext, expected_real_ext=None):
2632 name, real_ext = os.path.splitext(filename)
2633 return (
2634 f'{name}.{ext}{real_ext}'
2635 if not expected_real_ext or real_ext[1:] == expected_real_ext
2636 else f'{filename}.{ext}')
2637
2638
2639 def replace_extension(filename, ext, expected_real_ext=None):
2640 name, real_ext = os.path.splitext(filename)
2641 return '{}.{}'.format(
2642 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2643 ext)
2644
2645
2646 def check_executable(exe, args=[]):
2647 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2648 args can be a list of arguments for a short output (like -version) """
2649 try:
2650 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2651 except OSError:
2652 return False
2653 return exe
2654
2655
2656 def _get_exe_version_output(exe, args, *, to_screen=None):
2657 if to_screen:
2658 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2659 try:
2660 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2661 # SIGTTOU if yt-dlp is run in the background.
2662 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2663 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2664 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2665 except OSError:
2666 return False
2667 return stdout
2668
2669
2670 def detect_exe_version(output, version_re=None, unrecognized='present'):
2671 assert isinstance(output, str)
2672 if version_re is None:
2673 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2674 m = re.search(version_re, output)
2675 if m:
2676 return m.group(1)
2677 else:
2678 return unrecognized
2679
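# Illustrative example with the default version_re:
# >>> detect_exe_version('ffmpeg version 4.4.1 Copyright (c) 2000-2021')
# '4.4.1'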
2680
2681 def get_exe_version(exe, args=['--version'],
2682 version_re=None, unrecognized='present'):
2683 """ Returns the version of the specified executable,
2684 or False if the executable is not present """
2685 out = _get_exe_version_output(exe, args)
2686 return detect_exe_version(out, version_re, unrecognized) if out else False
2687
2688
2689 def frange(start=0, stop=None, step=1):
2690 """Float range"""
2691 if stop is None:
2692 start, stop = 0, start
2693 sign = [-1, 1][step > 0] if step else 0
2694 while sign * start < sign * stop:
2695 yield start
2696 start += step
2697
2698
2699 class LazyList(collections.abc.Sequence):
2700 """Lazy immutable list from an iterable
2701 Note that slices of a LazyList are lists and not LazyList"""
2702
2703 class IndexError(IndexError):
2704 pass
2705
2706 def __init__(self, iterable, *, reverse=False, _cache=None):
2707 self._iterable = iter(iterable)
2708 self._cache = [] if _cache is None else _cache
2709 self._reversed = reverse
2710
2711 def __iter__(self):
2712 if self._reversed:
2713 # We need to consume the entire iterable to iterate in reverse
2714 yield from self.exhaust()
2715 return
2716 yield from self._cache
2717 for item in self._iterable:
2718 self._cache.append(item)
2719 yield item
2720
2721 def _exhaust(self):
2722 self._cache.extend(self._iterable)
2723 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2724 return self._cache
2725
2726 def exhaust(self):
2727 """Evaluate the entire iterable"""
2728 return self._exhaust()[::-1 if self._reversed else 1]
2729
2730 @staticmethod
2731 def _reverse_index(x):
2732 return None if x is None else ~x
2733
2734 def __getitem__(self, idx):
2735 if isinstance(idx, slice):
2736 if self._reversed:
2737 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2738 start, stop, step = idx.start, idx.stop, idx.step or 1
2739 elif isinstance(idx, int):
2740 if self._reversed:
2741 idx = self._reverse_index(idx)
2742 start, stop, step = idx, idx, 0
2743 else:
2744 raise TypeError('indices must be integers or slices')
2745 if ((start or 0) < 0 or (stop or 0) < 0
2746 or (start is None and step < 0)
2747 or (stop is None and step > 0)):
2748 # We need to consume the entire iterable to be able to slice from the end
2749 # Obviously, never use this with infinite iterables
2750 self._exhaust()
2751 try:
2752 return self._cache[idx]
2753 except IndexError as e:
2754 raise self.IndexError(e) from e
2755 n = max(start or 0, stop or 0) - len(self._cache) + 1
2756 if n > 0:
2757 self._cache.extend(itertools.islice(self._iterable, n))
2758 try:
2759 return self._cache[idx]
2760 except IndexError as e:
2761 raise self.IndexError(e) from e
2762
2763 def __bool__(self):
2764 try:
2765 self[-1] if self._reversed else self[0]
2766 except self.IndexError:
2767 return False
2768 return True
2769
2770 def __len__(self):
2771 self._exhaust()
2772 return len(self._cache)
2773
2774 def __reversed__(self):
2775 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2776
2777 def __copy__(self):
2778 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2779
2780 def __repr__(self):
2781 # repr and str should mimic a list. So we exhaust the iterable
2782 return repr(self.exhaust())
2783
2784 def __str__(self):
2785 return repr(self.exhaust())
2786
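# Illustrative sketch: a LazyList consumes its iterable only as far as needed,
# so it is safe to index into an infinite iterator:
# >>> lazy = LazyList(itertools.count())
# >>> lazy[3]  # consumes items 0..3 only
# 3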
2787
2788 class PagedList:
2789
2790 class IndexError(IndexError):
2791 pass
2792
2793 def __len__(self):
2794 # This is only useful for tests
2795 return len(self.getslice())
2796
2797 def __init__(self, pagefunc, pagesize, use_cache=True):
2798 self._pagefunc = pagefunc
2799 self._pagesize = pagesize
2800 self._pagecount = float('inf')
2801 self._use_cache = use_cache
2802 self._cache = {}
2803
2804 def getpage(self, pagenum):
2805 page_results = self._cache.get(pagenum)
2806 if page_results is None:
2807 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2808 if self._use_cache:
2809 self._cache[pagenum] = page_results
2810 return page_results
2811
2812 def getslice(self, start=0, end=None):
2813 return list(self._getslice(start, end))
2814
2815 def _getslice(self, start, end):
2816 raise NotImplementedError('This method must be implemented by subclasses')
2817
2818 def __getitem__(self, idx):
2819 assert self._use_cache, 'Indexing PagedList requires cache'
2820 if not isinstance(idx, int) or idx < 0:
2821 raise TypeError('indices must be non-negative integers')
2822 entries = self.getslice(idx, idx + 1)
2823 if not entries:
2824 raise self.IndexError()
2825 return entries[0]
2826
2827
2828 class OnDemandPagedList(PagedList):
2829 """Download pages until a page with less than maximum results"""
2830
2831 def _getslice(self, start, end):
2832 for pagenum in itertools.count(start // self._pagesize):
2833 firstid = pagenum * self._pagesize
2834 nextfirstid = pagenum * self._pagesize + self._pagesize
2835 if start >= nextfirstid:
2836 continue
2837
2838 startv = (
2839 start % self._pagesize
2840 if firstid <= start < nextfirstid
2841 else 0)
2842 endv = (
2843 ((end - 1) % self._pagesize) + 1
2844 if (end is not None and firstid <= end <= nextfirstid)
2845 else None)
2846
2847 try:
2848 page_results = self.getpage(pagenum)
2849 except Exception:
2850 self._pagecount = pagenum - 1
2851 raise
2852 if startv != 0 or endv is not None:
2853 page_results = page_results[startv:endv]
2854 yield from page_results
2855
2856 # A little optimization - if the current page is not "full", i.e. does
2857 # not contain page_size videos, then we can assume that this page
2858 # is the last one - there are no more ids on further pages -
2859 # so there is no need to query again.
2860 if len(page_results) + startv < self._pagesize:
2861 break
2862
2863 # If we got the whole page, but the next page is not interesting,
2864 # break out early as well
2865 if end == nextfirstid:
2866 break
2867
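# Illustrative sketch with a hypothetical page function (10 items, pages of 3);
# only the pages overlapping the requested slice are fetched:
# >>> pl = OnDemandPagedList(lambda n: list(range(10))[n * 3:(n + 1) * 3], 3)
# >>> pl.getslice(2, 5)
# [2, 3, 4]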
2868
2869 class InAdvancePagedList(PagedList):
2870 """PagedList with total number of pages known in advance"""
2871
2872 def __init__(self, pagefunc, pagecount, pagesize):
2873 PagedList.__init__(self, pagefunc, pagesize, True)
2874 self._pagecount = pagecount
2875
2876 def _getslice(self, start, end):
2877 start_page = start // self._pagesize
2878 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2879 skip_elems = start - start_page * self._pagesize
2880 only_more = None if end is None else end - start
2881 for pagenum in range(start_page, end_page):
2882 page_results = self.getpage(pagenum)
2883 if skip_elems:
2884 page_results = page_results[skip_elems:]
2885 skip_elems = None
2886 if only_more is not None:
2887 if len(page_results) < only_more:
2888 only_more -= len(page_results)
2889 else:
2890 yield from page_results[:only_more]
2891 break
2892 yield from page_results
2893
2894
2895 class PlaylistEntries:
2896 MissingEntry = object()
2897 is_exhausted = False
2898
2899 def __init__(self, ydl, info_dict):
2900 self.ydl = ydl
2901
2902 # _entries must be assigned now since infodict can change during iteration
2903 entries = info_dict.get('entries')
2904 if entries is None:
2905 raise EntryNotInPlaylist('There are no entries')
2906 elif isinstance(entries, list):
2907 self.is_exhausted = True
2908
2909 requested_entries = info_dict.get('requested_entries')
2910 self.is_incomplete = bool(requested_entries)
2911 if self.is_incomplete:
2912 assert self.is_exhausted
2913 self._entries = [self.MissingEntry] * max(requested_entries)
2914 for i, entry in zip(requested_entries, entries):
2915 self._entries[i - 1] = entry
2916 elif isinstance(entries, (list, PagedList, LazyList)):
2917 self._entries = entries
2918 else:
2919 self._entries = LazyList(entries)
2920
2921 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2922 (?P<start>[+-]?\d+)?
2923 (?P<range>[:-]
2924 (?P<end>[+-]?\d+|inf(?:inite)?)?
2925 (?::(?P<step>[+-]?\d+))?
2926 )?''')
2927
2928 @classmethod
2929 def parse_playlist_items(cls, string):
2930 for segment in string.split(','):
2931 if not segment:
2932 raise ValueError('There are two or more consecutive commas')
2933 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2934 if not mobj:
2935 raise ValueError(f'{segment!r} is not a valid specification')
2936 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2937 if int_or_none(step) == 0:
2938 raise ValueError(f'Step in {segment!r} cannot be zero')
2939 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2940
2941 def get_requested_items(self):
2942 playlist_items = self.ydl.params.get('playlist_items')
2943 playlist_start = self.ydl.params.get('playliststart', 1)
2944 playlist_end = self.ydl.params.get('playlistend')
2945 # For backwards compatibility, interpret -1 as whole list
2946 if playlist_end in (-1, None):
2947 playlist_end = ''
2948 if not playlist_items:
2949 playlist_items = f'{playlist_start}:{playlist_end}'
2950 elif playlist_start != 1 or playlist_end:
2951 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2952
2953 for index in self.parse_playlist_items(playlist_items):
2954 for i, entry in self[index]:
2955 yield i, entry
2956 if not entry:
2957 continue
2958 try:
2959 # TODO: Add auto-generated fields
2960 self.ydl._match_entry(entry, incomplete=True, silent=True)
2961 except (ExistingVideoReached, RejectedVideoReached):
2962 return
2963
2964 def get_full_count(self):
2965 if self.is_exhausted and not self.is_incomplete:
2966 return len(self)
2967 elif isinstance(self._entries, InAdvancePagedList):
2968 if self._entries._pagesize == 1:
2969 return self._entries._pagecount
2970
2971 @functools.cached_property
2972 def _getter(self):
2973 if isinstance(self._entries, list):
2974 def get_entry(i):
2975 try:
2976 entry = self._entries[i]
2977 except IndexError:
2978 entry = self.MissingEntry
2979 if not self.is_incomplete:
2980 raise self.IndexError()
2981 if entry is self.MissingEntry:
2982 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2983 return entry
2984 else:
2985 def get_entry(i):
2986 try:
2987 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2988 except (LazyList.IndexError, PagedList.IndexError):
2989 raise self.IndexError()
2990 return get_entry
2991
2992 def __getitem__(self, idx):
2993 if isinstance(idx, int):
2994 idx = slice(idx, idx)
2995
2996 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2997 step = 1 if idx.step is None else idx.step
2998 if idx.start is None:
2999 start = 0 if step > 0 else len(self) - 1
3000 else:
3001 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3002
3003 # NB: Do not call len(self) when idx == [:]
3004 if idx.stop is None:
3005 stop = 0 if step < 0 else float('inf')
3006 else:
3007 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3008 stop += [-1, 1][step > 0]
3009
3010 for i in frange(start, stop, step):
3011 if i < 0:
3012 continue
3013 try:
3014 entry = self._getter(i)
3015 except self.IndexError:
3016 self.is_exhausted = True
3017 if step > 0:
3018 break
3019 continue
3020 yield i + 1, entry
3021
3022 def __len__(self):
3023 return len(tuple(self[:]))
3024
3025 class IndexError(IndexError):
3026 pass
3027
3028
3029 def uppercase_escape(s):
3030 unicode_escape = codecs.getdecoder('unicode_escape')
3031 return re.sub(
3032 r'\\U[0-9a-fA-F]{8}',
3033 lambda m: unicode_escape(m.group(0))[0],
3034 s)
3035
3036
3037 def lowercase_escape(s):
3038 unicode_escape = codecs.getdecoder('unicode_escape')
3039 return re.sub(
3040 r'\\u[0-9a-fA-F]{4}',
3041 lambda m: unicode_escape(m.group(0))[0],
3042 s)
3043
3044
3045 def escape_rfc3986(s):
3046 """Escape non-ASCII characters as suggested by RFC 3986"""
3047 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3048
3049
3050 def escape_url(url):
3051 """Escape URL as suggested by RFC 3986"""
3052 url_parsed = urllib.parse.urlparse(url)
3053 return url_parsed._replace(
3054 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3055 path=escape_rfc3986(url_parsed.path),
3056 params=escape_rfc3986(url_parsed.params),
3057 query=escape_rfc3986(url_parsed.query),
3058 fragment=escape_rfc3986(url_parsed.fragment)
3059 ).geturl()
3060
3061
3062 def parse_qs(url):
3063 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3064
3065
3066 def read_batch_urls(batch_fd):
3067 def fixup(url):
3068 if not isinstance(url, str):
3069 url = url.decode('utf-8', 'replace')
3070 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3071 for bom in BOM_UTF8:
3072 if url.startswith(bom):
3073 url = url[len(bom):]
3074 url = url.lstrip()
3075 if not url or url.startswith(('#', ';', ']')):
3076 return False
3077 # "#" cannot be stripped out since it is part of the URI
3078 # However, it can be safely stripped out if it follows a whitespace
3079 return re.split(r'\s#', url, 1)[0].rstrip()
3080
3081 with contextlib.closing(batch_fd) as fd:
3082 return [url for url in map(fixup, fd) if url]
3083
3084
3085 def urlencode_postdata(*args, **kargs):
3086 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3087
3088
3089 def update_url_query(url, query):
3090 if not query:
3091 return url
3092 parsed_url = urllib.parse.urlparse(url)
3093 qs = urllib.parse.parse_qs(parsed_url.query)
3094 qs.update(query)
3095 return urllib.parse.urlunparse(parsed_url._replace(
3096 query=urllib.parse.urlencode(qs, True)))
3097
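# Illustrative example (output verified by hand):
# >>> update_url_query('http://example.com/path?a=1', {'b': '2'})
# 'http://example.com/path?a=1&b=2'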
3098
3099 def update_Request(req, url=None, data=None, headers=None, query=None):
3100 req_headers = req.headers.copy()
3101 req_headers.update(headers or {})
3102 req_data = data or req.data
3103 req_url = update_url_query(url or req.get_full_url(), query)
3104 req_get_method = req.get_method()
3105 if req_get_method == 'HEAD':
3106 req_type = HEADRequest
3107 elif req_get_method == 'PUT':
3108 req_type = PUTRequest
3109 else:
3110 req_type = urllib.request.Request
3111 new_req = req_type(
3112 req_url, data=req_data, headers=req_headers,
3113 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3114 if hasattr(req, 'timeout'):
3115 new_req.timeout = req.timeout
3116 return new_req
3117
3118
3119 def _multipart_encode_impl(data, boundary):
3120 content_type = 'multipart/form-data; boundary=%s' % boundary
3121
3122 out = b''
3123 for k, v in data.items():
3124 out += b'--' + boundary.encode('ascii') + b'\r\n'
3125 if isinstance(k, str):
3126 k = k.encode()
3127 if isinstance(v, str):
3128 v = v.encode()
3129 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3130 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3131 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3132 if boundary.encode('ascii') in content:
3133 raise ValueError('Boundary overlaps with data')
3134 out += content
3135
3136 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3137
3138 return out, content_type
3139
3140
3141 def multipart_encode(data, boundary=None):
3142 '''
3143 Encode a dict to RFC 7578-compliant form-data
3144
3145 data:
3146 A dict where keys and values can be either Unicode or bytes-like
3147 objects.
3148 boundary:
3149 If specified, it must be a Unicode object and is used as the boundary.
3150 Otherwise a random boundary is generated.
3151
3152 Reference: https://tools.ietf.org/html/rfc7578
3153 '''
3154 has_specified_boundary = boundary is not None
3155
3156 while True:
3157 if boundary is None:
3158 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3159
3160 try:
3161 out, content_type = _multipart_encode_impl(data, boundary)
3162 break
3163 except ValueError:
3164 if has_specified_boundary:
3165 raise
3166 boundary = None
3167
3168 return out, content_type
3169
3170
3171 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3172 for val in map(d.get, variadic(key_or_keys)):
3173 if val is not None and (val or not skip_false_values):
3174 return val
3175 return default
3176
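# Illustrative examples showing how falsy values are skipped by default:
# >>> dict_get({'a': None, 'b': 0, 'c': 5}, ('a', 'b', 'c'))
# 5
# >>> dict_get({'a': None, 'b': 0}, ('a', 'b'), skip_false_values=False)
# 0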
3177
3178 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3179 for f in funcs:
3180 try:
3181 val = f(*args, **kwargs)
3182 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3183 pass
3184 else:
3185 if expected_type is None or isinstance(val, expected_type):
3186 return val
3187
3188
3189 def try_get(src, getter, expected_type=None):
3190 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3191
3192
3193 def filter_dict(dct, cndn=lambda _, v: v is not None):
3194 return {k: v for k, v in dct.items() if cndn(k, v)}
3195
3196
3197 def merge_dicts(*dicts):
3198 merged = {}
3199 for a_dict in dicts:
3200 for k, v in a_dict.items():
3201 if (v is not None and k not in merged
3202 or isinstance(v, str) and merged[k] == ''):
3203 merged[k] = v
3204 return merged
3205
3206
3207 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3208 return string if isinstance(string, str) else str(string, encoding, errors)
3209
3210
3211 US_RATINGS = {
3212 'G': 0,
3213 'PG': 10,
3214 'PG-13': 13,
3215 'R': 16,
3216 'NC': 18,
3217 }
3218
3219
3220 TV_PARENTAL_GUIDELINES = {
3221 'TV-Y': 0,
3222 'TV-Y7': 7,
3223 'TV-G': 0,
3224 'TV-PG': 0,
3225 'TV-14': 14,
3226 'TV-MA': 17,
3227 }
3228
3229
3230 def parse_age_limit(s):
3231 # isinstance(False, int) is True. So type() must be used instead
3232 if type(s) is int: # noqa: E721
3233 return s if 0 <= s <= 21 else None
3234 elif not isinstance(s, str):
3235 return None
3236 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3237 if m:
3238 return int(m.group('age'))
3239 s = s.upper()
3240 if s in US_RATINGS:
3241 return US_RATINGS[s]
3242 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3243 if m:
3244 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3245 return None
3246
3247
3248 def strip_jsonp(code):
3249 return re.sub(
3250 r'''(?sx)^
3251 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3252 (?:\s*&&\s*(?P=func_name))?
3253 \s*\(\s*(?P<callback_data>.*)\);?
3254 \s*?(?://[^\n]*)*$''',
3255 r'\g<callback_data>', code)
3256
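# Illustrative example:
# >>> strip_jsonp('window.callback({"status": "ok"});')
# '{"status": "ok"}'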
3257
3258 def js_to_json(code, vars={}, *, strict=False):
3259 # vars is a dict of var, val pairs to substitute
3260 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3261 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3262 INTEGER_TABLE = (
3263 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3264 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3265 )
3266
3267 def fix_kv(m):
3268 v = m.group(0)
3269 if v in ('true', 'false', 'null'):
3270 return v
3271 elif v in ('undefined', 'void 0'):
3272 return 'null'
3273 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3274 return ""
3275
3276 if v[0] in ("'", '"'):
3277 v = re.sub(r'(?s)\\.|"', lambda m: {
3278 '"': '\\"',
3279 "\\'": "'",
3280 '\\\n': '',
3281 '\\x': '\\u00',
3282 }.get(m.group(0), m.group(0)), v[1:-1])
3283 else:
3284 for regex, base in INTEGER_TABLE:
3285 im = re.match(regex, v)
3286 if im:
3287 i = int(im.group(1), base)
3288 return '"%d":' % i if v.endswith(':') else '%d' % i
3289
3290 if v in vars:
3291 return vars[v]
3292 if strict:
3293 raise ValueError(f'Unknown value: {v}')
3294
3295 return '"%s"' % v
3296
3297 def create_map(mobj):
3298 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3299
3300 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3301 if not strict:
3302 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3303
3304 return re.sub(r'''(?sx)
3305 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3306 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3307 {comment}|,(?={skip}[\]}}])|
3308 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3309 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3310 [0-9]+(?={skip}:)|
3311 !+
3312 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3313
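# Illustrative example: bare keys are quoted and hex integers are converted:
# >>> js_to_json('{abc: true, val: 0x1b}')
# '{"abc": true, "val": 27}'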
3314
3315 def qualities(quality_ids):
3316 """ Get a numeric quality value out of a list of possible values """
3317 def q(qid):
3318 try:
3319 return quality_ids.index(qid)
3320 except ValueError:
3321 return -1
3322 return q
3323
3324
3325 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3326
3327
3328 DEFAULT_OUTTMPL = {
3329 'default': '%(title)s [%(id)s].%(ext)s',
3330 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3331 }
3332 OUTTMPL_TYPES = {
3333 'chapter': None,
3334 'subtitle': None,
3335 'thumbnail': None,
3336 'description': 'description',
3337 'annotation': 'annotations.xml',
3338 'infojson': 'info.json',
3339 'link': None,
3340 'pl_video': None,
3341 'pl_thumbnail': None,
3342 'pl_description': 'description',
3343 'pl_infojson': 'info.json',
3344 }
3345
3346 # As of [1] format syntax is:
3347 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3348 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3349 STR_FORMAT_RE_TMPL = r'''(?x)
3350 (?<!%)(?P<prefix>(?:%%)*)
3351 %
3352 (?P<has_key>\((?P<key>{0})\))?
3353 (?P<format>
3354 (?P<conversion>[#0\-+ ]+)?
3355 (?P<min_width>\d+)?
3356 (?P<precision>\.\d+)?
3357 (?P<len_mod>[hlL])? # unused in python
3358 {1} # conversion type
3359 )
3360 '''
3361
3362
3363 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3364
3365
3366 def limit_length(s, length):
3367 """ Add ellipses to overly long strings """
3368 if s is None:
3369 return None
3370 ELLIPSES = '...'
3371 if len(s) > length:
3372 return s[:length - len(ELLIPSES)] + ELLIPSES
3373 return s
3374
3375
3376 def version_tuple(v):
3377 return tuple(int(e) for e in re.split(r'[-.]', v))
3378
3379
3380 def is_outdated_version(version, limit, assume_new=True):
3381 if not version:
3382 return not assume_new
3383 try:
3384 return version_tuple(version) < version_tuple(limit)
3385 except ValueError:
3386 return not assume_new
3387
3388
3389 def ytdl_is_updateable():
3390 """ Returns if yt-dlp can be updated with -U """
3391
3392 from .update import is_non_updateable
3393
3394 return not is_non_updateable()
3395
3396
3397 def args_to_str(args):
3398 # Get a short string representation for a subprocess command
3399 return ' '.join(compat_shlex_quote(a) for a in args)
3400
3401
3402 def error_to_compat_str(err):
3403 return str(err)
3404
3405
3406 def error_to_str(err):
3407 return f'{type(err).__name__}: {err}'
3408
3409
3410 def mimetype2ext(mt):
3411 if mt is None:
3412 return None
3413
3414 mt, _, params = mt.partition(';')
3415 mt = mt.strip()
3416
3417 FULL_MAP = {
3418 'audio/mp4': 'm4a',
3419 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Use .mp3 here, as
3420 # it's the most popular one
3421 'audio/mpeg': 'mp3',
3422 'audio/x-wav': 'wav',
3423 'audio/wav': 'wav',
3424 'audio/wave': 'wav',
3425 }
3426
3427 ext = FULL_MAP.get(mt)
3428 if ext is not None:
3429 return ext
3430
3431 SUBTYPE_MAP = {
3432 '3gpp': '3gp',
3433 'smptett+xml': 'tt',
3434 'ttaf+xml': 'dfxp',
3435 'ttml+xml': 'ttml',
3436 'x-flv': 'flv',
3437 'x-mp4-fragmented': 'mp4',
3438 'x-ms-sami': 'sami',
3439 'x-ms-wmv': 'wmv',
3440 'mpegurl': 'm3u8',
3441 'x-mpegurl': 'm3u8',
3442 'vnd.apple.mpegurl': 'm3u8',
3443 'dash+xml': 'mpd',
3444 'f4m+xml': 'f4m',
3445 'hds+xml': 'f4m',
3446 'vnd.ms-sstr+xml': 'ism',
3447 'quicktime': 'mov',
3448 'mp2t': 'ts',
3449 'x-wav': 'wav',
3450 'filmstrip+json': 'fs',
3451 'svg+xml': 'svg',
3452 }
3453
3454 _, _, subtype = mt.rpartition('/')
3455 ext = SUBTYPE_MAP.get(subtype.lower())
3456 if ext is not None:
3457 return ext
3458
3459 SUFFIX_MAP = {
3460 'json': 'json',
3461 'xml': 'xml',
3462 'zip': 'zip',
3463 'gzip': 'gz',
3464 }
3465
3466 _, _, suffix = subtype.partition('+')
3467 ext = SUFFIX_MAP.get(suffix)
3468 if ext is not None:
3469 return ext
3470
3471 return subtype.replace('+', '.')
3472
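# Illustrative examples (the second relies on SUBTYPE_MAP above):
# >>> mimetype2ext('audio/x-wav;codec=pcm')
# 'wav'
# >>> mimetype2ext('application/dash+xml')
# 'mpd'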
3473
3474 def ext2mimetype(ext_or_url):
3475 if not ext_or_url:
3476 return None
3477 if '.' not in ext_or_url:
3478 ext_or_url = f'file.{ext_or_url}'
3479 return mimetypes.guess_type(ext_or_url)[0]
3480
3481
3482 def parse_codecs(codecs_str):
3483 # http://tools.ietf.org/html/rfc6381
3484 if not codecs_str:
3485 return {}
3486 split_codecs = list(filter(None, map(
3487 str.strip, codecs_str.strip().strip(',').split(','))))
3488 vcodec, acodec, scodec, hdr = None, None, None, None
3489 for full_codec in split_codecs:
3490 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3491 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3492 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3493 if vcodec:
3494 continue
3495 vcodec = full_codec
3496 if parts[0] in ('dvh1', 'dvhe'):
3497 hdr = 'DV'
3498 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3499 hdr = 'HDR10'
3500 elif parts[:2] == ['vp9', '2']:
3501 hdr = 'HDR10'
3502 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3503 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3504 acodec = acodec or full_codec
3505 elif parts[0] in ('stpp', 'wvtt'):
3506 scodec = scodec or full_codec
3507 else:
3508 write_string(f'WARNING: Unknown codec {full_codec}\n')
3509 if vcodec or acodec or scodec:
3510 return {
3511 'vcodec': vcodec or 'none',
3512 'acodec': acodec or 'none',
3513 'dynamic_range': hdr,
3514 **({'scodec': scodec} if scodec is not None else {}),
3515 }
3516 elif len(split_codecs) == 2:
3517 return {
3518 'vcodec': split_codecs[0],
3519 'acodec': split_codecs[1],
3520 }
3521 return {}
3522
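# Illustrative example (output verified by hand against the logic above):
# >>> parse_codecs('avc1.64001f, mp4a.40.2')
# {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}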
3523
3524 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3525 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3526
3527 allow_mkv = not preferences or 'mkv' in preferences
3528
3529 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3530 return 'mkv' # TODO: any other format allows this?
3531
3532 # TODO: Not all codecs supported by parse_codecs are handled here
3533 COMPATIBLE_CODECS = {
3534 'mp4': {
3535 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3536 'h264', 'aacl', # Set in ISM
3537 },
3538 'webm': {
3539 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3540 'vp9x', 'vp8x', # in the webm spec
3541 },
3542 }
3543
3544 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3545 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3546
3547 for ext in preferences or COMPATIBLE_CODECS.keys():
3548 codec_set = COMPATIBLE_CODECS.get(ext, set())
3549 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3550 return ext
3551
3552 COMPATIBLE_EXTS = (
3553 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3554 {'webm'},
3555 )
3556 for ext in preferences or vexts:
3557 current_exts = {ext, *vexts, *aexts}
3558 if ext == 'mkv' or current_exts == {ext} or any(
3559 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3560 return ext
3561 return 'mkv' if allow_mkv else preferences[-1]
3562
3563
3564 def urlhandle_detect_ext(url_handle):
3565 getheader = url_handle.headers.get
3566
3567 cd = getheader('Content-Disposition')
3568 if cd:
3569 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3570 if m:
3571 e = determine_ext(m.group('filename'), default_ext=None)
3572 if e:
3573 return e
3574
3575 return mimetype2ext(getheader('Content-Type'))
3576
3577
3578 def encode_data_uri(data, mime_type):
3579 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3580
3581
3582 def age_restricted(content_limit, age_limit):
3583 """ Returns True iff the content should be blocked """
3584
3585 if age_limit is None: # No limit set
3586 return False
3587 if content_limit is None:
3588 return False # Content available for everyone
3589 return age_limit < content_limit
3590
3591
3592 # List of known byte-order-marks (BOM)
3593 BOMS = [
3594 (b'\xef\xbb\xbf', 'utf-8'),
3595 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3596 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3597 (b'\xff\xfe', 'utf-16-le'),
3598 (b'\xfe\xff', 'utf-16-be'),
3599 ]
3600
3601
3602 def is_html(first_bytes):
3603 """ Detect whether a file contains HTML by examining its first bytes. """
3604
3605 encoding = 'utf-8'
3606 for bom, enc in BOMS:
3607 while first_bytes.startswith(bom):
3608 encoding, first_bytes = enc, first_bytes[len(bom):]
3609
3610 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3611
3612
3613 def determine_protocol(info_dict):
3614 protocol = info_dict.get('protocol')
3615 if protocol is not None:
3616 return protocol
3617
3618 url = sanitize_url(info_dict['url'])
3619 if url.startswith('rtmp'):
3620 return 'rtmp'
3621 elif url.startswith('mms'):
3622 return 'mms'
3623 elif url.startswith('rtsp'):
3624 return 'rtsp'
3625
3626 ext = determine_ext(url)
3627 if ext == 'm3u8':
3628 return 'm3u8'
3629 elif ext == 'f4m':
3630 return 'f4m'
3631
3632 return urllib.parse.urlparse(url).scheme
3633
3634
3635 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3636 """ Render a list of rows, each as a list of values.
3637 Text after a \t will be right aligned """
3638 def width(string):
3639 return len(remove_terminal_sequences(string).replace('\t', ''))
3640
3641 def get_max_lens(table):
3642 return [max(width(str(v)) for v in col) for col in zip(*table)]
3643
3644 def filter_using_list(row, filterArray):
3645 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3646
3647 max_lens = get_max_lens(data) if hide_empty else []
3648 header_row = filter_using_list(header_row, max_lens)
3649 data = [filter_using_list(row, max_lens) for row in data]
3650
3651 table = [header_row] + data
3652 max_lens = get_max_lens(table)
3653 extra_gap += 1
3654 if delim:
3655 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3656 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3657 for row in table:
3658 for pos, text in enumerate(map(str, row)):
3659 if '\t' in text:
3660 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3661 else:
3662 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3663 ret = '\n'.join(''.join(row).rstrip() for row in table)
3664 return ret
3665
3666
3667 def _match_one(filter_part, dct, incomplete):
3668 # TODO: Generalize code with YoutubeDL._build_format_filter
3669 STRING_OPERATORS = {
3670 '*=': operator.contains,
3671 '^=': lambda attr, value: attr.startswith(value),
3672 '$=': lambda attr, value: attr.endswith(value),
3673 '~=': lambda attr, value: re.search(value, attr),
3674 }
3675 COMPARISON_OPERATORS = {
3676 **STRING_OPERATORS,
3677 '<=': operator.le, # "<=" must be defined above "<"
3678 '<': operator.lt,
3679 '>=': operator.ge,
3680 '>': operator.gt,
3681 '=': operator.eq,
3682 }
3683
3684 if isinstance(incomplete, bool):
3685 is_incomplete = lambda _: incomplete
3686 else:
3687 is_incomplete = lambda k: k in incomplete
3688
3689 operator_rex = re.compile(r'''(?x)
3690 (?P<key>[a-z_]+)
3691 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3692 (?:
3693 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3694 (?P<strval>.+?)
3695 )
3696 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3697 m = operator_rex.fullmatch(filter_part.strip())
3698 if m:
3699 m = m.groupdict()
3700 unnegated_op = COMPARISON_OPERATORS[m['op']]
3701 if m['negation']:
3702 op = lambda attr, value: not unnegated_op(attr, value)
3703 else:
3704 op = unnegated_op
3705 comparison_value = m['quotedstrval'] or m['strval'] # the regex defines no 'intval' group
3706 if m['quote']:
3707 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3708 actual_value = dct.get(m['key'])
3709 numeric_comparison = None
3710 if isinstance(actual_value, (int, float)):
3711 # If the original field is a string and the matching comparison value is
3712 # a number, we should respect the origin of the original field
3713 # and process the comparison value as a string (see
3714 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3715 try:
3716 numeric_comparison = int(comparison_value)
3717 except ValueError:
3718 numeric_comparison = parse_filesize(comparison_value)
3719 if numeric_comparison is None:
3720 numeric_comparison = parse_filesize(f'{comparison_value}B')
3721 if numeric_comparison is None:
3722 numeric_comparison = parse_duration(comparison_value)
3723 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3724 raise ValueError('Operator %s only supports string values!' % m['op'])
3725 if actual_value is None:
3726 return is_incomplete(m['key']) or m['none_inclusive']
3727 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3728
3729 UNARY_OPERATORS = {
3730 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3731 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3732 }
3733 operator_rex = re.compile(r'''(?x)
3734 (?P<op>%s)\s*(?P<key>[a-z_]+)
3735 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3736 m = operator_rex.fullmatch(filter_part.strip())
3737 if m:
3738 op = UNARY_OPERATORS[m.group('op')]
3739 actual_value = dct.get(m.group('key'))
3740 if is_incomplete(m.group('key')) and actual_value is None:
3741 return True
3742 return op(actual_value)
3743
3744 raise ValueError('Invalid filter part %r' % filter_part)
3745
3746
3747 def match_str(filter_str, dct, incomplete=False):
3748 """ Filter a dictionary with a simple string syntax.
3749 @returns Whether the filter passes
3750 @param incomplete Set of keys that are expected to be missing from dct.
3751 Can be True/False to indicate all/none of the keys may be missing.
3752 All conditions on incomplete keys pass if the key is missing
3753 """
3754 return all(
3755 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3756 for filter_part in re.split(r'(?<!\\)&', filter_str))
3757
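# Example (illustrative): conditions are joined with '&' and must all pass:
#   match_str('duration > 60 & like_count > 100',
#             {'duration': 120, 'like_count': 190}) == True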
3758
3759 def match_filter_func(filters):
3760 if not filters:
3761 return None
3762 filters = set(variadic(filters))
3763
3764 interactive = '-' in filters
3765 if interactive:
3766 filters.remove('-')
3767
3768 def _match_func(info_dict, incomplete=False):
3769 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3770 return NO_DEFAULT if interactive and not incomplete else None
3771 else:
3772 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3773 filter_str = ') | ('.join(map(str.strip, filters))
3774 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3775 return _match_func
3776
3777
3778 class download_range_func:
3779 def __init__(self, chapters, ranges):
3780 self.chapters, self.ranges = chapters, ranges
3781
3782 def __call__(self, info_dict, ydl):
3783 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3784 else 'Cannot match chapters since chapter information is unavailable')
3785 for regex in self.chapters or []:
3786 for i, chapter in enumerate(info_dict.get('chapters') or []):
3787 if re.search(regex, chapter['title']):
3788 warning = None
3789 yield {**chapter, 'index': i}
3790 if self.chapters and warning:
3791 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3792
3793 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3794
3795 def __eq__(self, other):
3796 return (isinstance(other, download_range_func)
3797 and self.chapters == other.chapters and self.ranges == other.ranges)
3798
3799
3800 def parse_dfxp_time_expr(time_expr):
3801 if not time_expr:
3802 return
3803
3804 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3805 if mobj:
3806 return float(mobj.group('time_offset'))
3807
3808 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3809 if mobj:
3810 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3811
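# Example (illustrative): both plain offsets and clock-time expressions parse
# to seconds:
#   parse_dfxp_time_expr('10.5s') == 10.5
#   parse_dfxp_time_expr('00:01:30.250') == 90.25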
3812
3813 def srt_subtitles_timecode(seconds):
3814 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3815
3816
3817 def ass_subtitles_timecode(seconds):
3818 time = timetuple_from_msec(seconds * 1000)
3819 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3820
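# Example (illustrative): the same instant rendered in the two formats:
#   srt_subtitles_timecode(3661.5) == '01:01:01,500'
#   ass_subtitles_timecode(3661.5) == '1:01:01.50'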
3821
3822 def dfxp2srt(dfxp_data):
3823 '''
3824 @param dfxp_data A bytes-like object containing DFXP data
3825 @returns A unicode object containing converted SRT data
3826 '''
3827 LEGACY_NAMESPACES = (
3828 (b'http://www.w3.org/ns/ttml', [
3829 b'http://www.w3.org/2004/11/ttaf1',
3830 b'http://www.w3.org/2006/04/ttaf1',
3831 b'http://www.w3.org/2006/10/ttaf1',
3832 ]),
3833 (b'http://www.w3.org/ns/ttml#styling', [
3834 b'http://www.w3.org/ns/ttml#style',
3835 ]),
3836 )
3837
3838 SUPPORTED_STYLING = [
3839 'color',
3840 'fontFamily',
3841 'fontSize',
3842 'fontStyle',
3843 'fontWeight',
3844 'textDecoration'
3845 ]
3846
3847 _x = functools.partial(xpath_with_ns, ns_map={
3848 'xml': 'http://www.w3.org/XML/1998/namespace',
3849 'ttml': 'http://www.w3.org/ns/ttml',
3850 'tts': 'http://www.w3.org/ns/ttml#styling',
3851 })
3852
3853 styles = {}
3854 default_style = {}
3855
3856 class TTMLPElementParser:
3857 _out = ''
3858 _unclosed_elements = []
3859 _applied_styles = []
3860
3861 def start(self, tag, attrib):
3862 if tag in (_x('ttml:br'), 'br'):
3863 self._out += '\n'
3864 else:
3865 unclosed_elements = []
3866 style = {}
3867 element_style_id = attrib.get('style')
3868 if default_style:
3869 style.update(default_style)
3870 if element_style_id:
3871 style.update(styles.get(element_style_id, {}))
3872 for prop in SUPPORTED_STYLING:
3873 prop_val = attrib.get(_x('tts:' + prop))
3874 if prop_val:
3875 style[prop] = prop_val
3876 if style:
3877 font = ''
3878 for k, v in sorted(style.items()):
3879 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3880 continue
3881 if k == 'color':
3882 font += ' color="%s"' % v
3883 elif k == 'fontSize':
3884 font += ' size="%s"' % v
3885 elif k == 'fontFamily':
3886 font += ' face="%s"' % v
3887 elif k == 'fontWeight' and v == 'bold':
3888 self._out += '<b>'
3889 unclosed_elements.append('b')
3890 elif k == 'fontStyle' and v == 'italic':
3891 self._out += '<i>'
3892 unclosed_elements.append('i')
3893 elif k == 'textDecoration' and v == 'underline':
3894 self._out += '<u>'
3895 unclosed_elements.append('u')
3896 if font:
3897 self._out += '<font' + font + '>'
3898 unclosed_elements.append('font')
3899 applied_style = {}
3900 if self._applied_styles:
3901 applied_style.update(self._applied_styles[-1])
3902 applied_style.update(style)
3903 self._applied_styles.append(applied_style)
3904 self._unclosed_elements.append(unclosed_elements)
3905
3906 def end(self, tag):
3907 if tag not in (_x('ttml:br'), 'br'):
3908 unclosed_elements = self._unclosed_elements.pop()
3909 for element in reversed(unclosed_elements):
3910 self._out += '</%s>' % element
3911 if unclosed_elements and self._applied_styles:
3912 self._applied_styles.pop()
3913
3914 def data(self, data):
3915 self._out += data
3916
3917 def close(self):
3918 return self._out.strip()
3919
3920 def parse_node(node):
3921 target = TTMLPElementParser()
3922 parser = xml.etree.ElementTree.XMLParser(target=target)
3923 parser.feed(xml.etree.ElementTree.tostring(node))
3924 return parser.close()
3925
3926 for k, v in LEGACY_NAMESPACES:
3927 for ns in v:
3928 dfxp_data = dfxp_data.replace(ns, k)
3929
3930 dfxp = compat_etree_fromstring(dfxp_data)
3931 out = []
3932 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3933
3934 if not paras:
3935 raise ValueError('Invalid dfxp/TTML subtitle')
3936
3937 repeat = False
3938 while True:
3939 for style in dfxp.findall(_x('.//ttml:style')):
3940 style_id = style.get('id') or style.get(_x('xml:id'))
3941 if not style_id:
3942 continue
3943 parent_style_id = style.get('style')
3944 if parent_style_id:
3945 if parent_style_id not in styles:
3946 repeat = True
3947 continue
3948 styles[style_id] = styles[parent_style_id].copy()
3949 for prop in SUPPORTED_STYLING:
3950 prop_val = style.get(_x('tts:' + prop))
3951 if prop_val:
3952 styles.setdefault(style_id, {})[prop] = prop_val
3953 if repeat:
3954 repeat = False
3955 else:
3956 break
3957
3958 for p in ('body', 'div'):
3959 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3960 if ele is None:
3961 continue
3962 style = styles.get(ele.get('style'))
3963 if not style:
3964 continue
3965 default_style.update(style)
3966
3967 for para, index in zip(paras, itertools.count(1)):
3968 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3969 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3970 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3971 if begin_time is None:
3972 continue
3973 if not end_time:
3974 if not dur:
3975 continue
3976 end_time = begin_time + dur
3977 out.append('%d\n%s --> %s\n%s\n\n' % (
3978 index,
3979 srt_subtitles_timecode(begin_time),
3980 srt_subtitles_timecode(end_time),
3981 parse_node(para)))
3982
3983 return ''.join(out)
3984
3985
3986 def cli_option(params, command_option, param, separator=None):
3987 param = params.get(param)
3988 return ([] if param is None
3989 else [command_option, str(param)] if separator is None
3990 else [f'{command_option}{separator}{param}'])
3991
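# Example (illustrative):
#   cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   == ['--proxy', '127.0.0.1:3128']
#   cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy', separator='=')
#   == ['--proxy=127.0.0.1:3128']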
3992
3993 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3994 param = params.get(param)
3995 assert param in (True, False, None)
3996 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3997
3998
3999 def cli_valueless_option(params, command_option, param, expected_value=True):
4000 return [command_option] if params.get(param) == expected_value else []
4001
4002
4003 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4004 if isinstance(argdict, (list, tuple)): # for backward compatibility
4005 if use_compat:
4006 return argdict
4007 else:
4008 argdict = None
4009 if argdict is None:
4010 return default
4011 assert isinstance(argdict, dict)
4012
4013 assert isinstance(keys, (list, tuple))
4014 for key_list in keys:
4015 arg_list = list(filter(
4016 lambda x: x is not None,
4017 [argdict.get(key.lower()) for key in variadic(key_list)]))
4018 if arg_list:
4019 return [arg for args in arg_list for arg in args]
4020 return default
4021
4022
4023 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4024 main_key, exe = main_key.lower(), exe.lower()
4025 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4026 keys = [f'{root_key}{k}' for k in (keys or [''])]
4027 if root_key in keys:
4028 if main_key != exe:
4029 keys.append((main_key, exe))
4030 keys.append('default')
4031 else:
4032 use_compat = False
4033 return cli_configuration_args(argdict, keys, default, use_compat)
4034
4035
4036 class ISO639Utils:
4037 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4038 _lang_map = {
4039 'aa': 'aar',
4040 'ab': 'abk',
4041 'ae': 'ave',
4042 'af': 'afr',
4043 'ak': 'aka',
4044 'am': 'amh',
4045 'an': 'arg',
4046 'ar': 'ara',
4047 'as': 'asm',
4048 'av': 'ava',
4049 'ay': 'aym',
4050 'az': 'aze',
4051 'ba': 'bak',
4052 'be': 'bel',
4053 'bg': 'bul',
4054 'bh': 'bih',
4055 'bi': 'bis',
4056 'bm': 'bam',
4057 'bn': 'ben',
4058 'bo': 'bod',
4059 'br': 'bre',
4060 'bs': 'bos',
4061 'ca': 'cat',
4062 'ce': 'che',
4063 'ch': 'cha',
4064 'co': 'cos',
4065 'cr': 'cre',
4066 'cs': 'ces',
4067 'cu': 'chu',
4068 'cv': 'chv',
4069 'cy': 'cym',
4070 'da': 'dan',
4071 'de': 'deu',
4072 'dv': 'div',
4073 'dz': 'dzo',
4074 'ee': 'ewe',
4075 'el': 'ell',
4076 'en': 'eng',
4077 'eo': 'epo',
4078 'es': 'spa',
4079 'et': 'est',
4080 'eu': 'eus',
4081 'fa': 'fas',
4082 'ff': 'ful',
4083 'fi': 'fin',
4084 'fj': 'fij',
4085 'fo': 'fao',
4086 'fr': 'fra',
4087 'fy': 'fry',
4088 'ga': 'gle',
4089 'gd': 'gla',
4090 'gl': 'glg',
4091 'gn': 'grn',
4092 'gu': 'guj',
4093 'gv': 'glv',
4094 'ha': 'hau',
4095 'he': 'heb',
4096 'iw': 'heb', # Replaced by he in 1989 revision
4097 'hi': 'hin',
4098 'ho': 'hmo',
4099 'hr': 'hrv',
4100 'ht': 'hat',
4101 'hu': 'hun',
4102 'hy': 'hye',
4103 'hz': 'her',
4104 'ia': 'ina',
4105 'id': 'ind',
4106 'in': 'ind', # Replaced by id in 1989 revision
4107 'ie': 'ile',
4108 'ig': 'ibo',
4109 'ii': 'iii',
4110 'ik': 'ipk',
4111 'io': 'ido',
4112 'is': 'isl',
4113 'it': 'ita',
4114 'iu': 'iku',
4115 'ja': 'jpn',
4116 'jv': 'jav',
4117 'ka': 'kat',
4118 'kg': 'kon',
4119 'ki': 'kik',
4120 'kj': 'kua',
4121 'kk': 'kaz',
4122 'kl': 'kal',
4123 'km': 'khm',
4124 'kn': 'kan',
4125 'ko': 'kor',
4126 'kr': 'kau',
4127 'ks': 'kas',
4128 'ku': 'kur',
4129 'kv': 'kom',
4130 'kw': 'cor',
4131 'ky': 'kir',
4132 'la': 'lat',
4133 'lb': 'ltz',
4134 'lg': 'lug',
4135 'li': 'lim',
4136 'ln': 'lin',
4137 'lo': 'lao',
4138 'lt': 'lit',
4139 'lu': 'lub',
4140 'lv': 'lav',
4141 'mg': 'mlg',
4142 'mh': 'mah',
4143 'mi': 'mri',
4144 'mk': 'mkd',
4145 'ml': 'mal',
4146 'mn': 'mon',
4147 'mr': 'mar',
4148 'ms': 'msa',
4149 'mt': 'mlt',
4150 'my': 'mya',
4151 'na': 'nau',
4152 'nb': 'nob',
4153 'nd': 'nde',
4154 'ne': 'nep',
4155 'ng': 'ndo',
4156 'nl': 'nld',
4157 'nn': 'nno',
4158 'no': 'nor',
4159 'nr': 'nbl',
4160 'nv': 'nav',
4161 'ny': 'nya',
4162 'oc': 'oci',
4163 'oj': 'oji',
4164 'om': 'orm',
4165 'or': 'ori',
4166 'os': 'oss',
4167 'pa': 'pan',
4168 'pi': 'pli',
4169 'pl': 'pol',
4170 'ps': 'pus',
4171 'pt': 'por',
4172 'qu': 'que',
4173 'rm': 'roh',
4174 'rn': 'run',
4175 'ro': 'ron',
4176 'ru': 'rus',
4177 'rw': 'kin',
4178 'sa': 'san',
4179 'sc': 'srd',
4180 'sd': 'snd',
4181 'se': 'sme',
4182 'sg': 'sag',
4183 'si': 'sin',
4184 'sk': 'slk',
4185 'sl': 'slv',
4186 'sm': 'smo',
4187 'sn': 'sna',
4188 'so': 'som',
4189 'sq': 'sqi',
4190 'sr': 'srp',
4191 'ss': 'ssw',
4192 'st': 'sot',
4193 'su': 'sun',
4194 'sv': 'swe',
4195 'sw': 'swa',
4196 'ta': 'tam',
4197 'te': 'tel',
4198 'tg': 'tgk',
4199 'th': 'tha',
4200 'ti': 'tir',
4201 'tk': 'tuk',
4202 'tl': 'tgl',
4203 'tn': 'tsn',
4204 'to': 'ton',
4205 'tr': 'tur',
4206 'ts': 'tso',
4207 'tt': 'tat',
4208 'tw': 'twi',
4209 'ty': 'tah',
4210 'ug': 'uig',
4211 'uk': 'ukr',
4212 'ur': 'urd',
4213 'uz': 'uzb',
4214 've': 'ven',
4215 'vi': 'vie',
4216 'vo': 'vol',
4217 'wa': 'wln',
4218 'wo': 'wol',
4219 'xh': 'xho',
4220 'yi': 'yid',
4221 'ji': 'yid', # Replaced by yi in 1989 revision
4222 'yo': 'yor',
4223 'za': 'zha',
4224 'zh': 'zho',
4225 'zu': 'zul',
4226 }
4227
4228 @classmethod
4229 def short2long(cls, code):
4230 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4231 return cls._lang_map.get(code[:2])
4232
4233 @classmethod
4234 def long2short(cls, code):
4235 """Convert language code from ISO 639-2/T to ISO 639-1"""
4236 for short_name, long_name in cls._lang_map.items():
4237 if long_name == code:
4238 return short_name
4239
4240
4241 class ISO3166Utils:
4242 # From http://data.okfn.org/data/core/country-list
4243 _country_map = {
4244 'AF': 'Afghanistan',
4245 'AX': 'Åland Islands',
4246 'AL': 'Albania',
4247 'DZ': 'Algeria',
4248 'AS': 'American Samoa',
4249 'AD': 'Andorra',
4250 'AO': 'Angola',
4251 'AI': 'Anguilla',
4252 'AQ': 'Antarctica',
4253 'AG': 'Antigua and Barbuda',
4254 'AR': 'Argentina',
4255 'AM': 'Armenia',
4256 'AW': 'Aruba',
4257 'AU': 'Australia',
4258 'AT': 'Austria',
4259 'AZ': 'Azerbaijan',
4260 'BS': 'Bahamas',
4261 'BH': 'Bahrain',
4262 'BD': 'Bangladesh',
4263 'BB': 'Barbados',
4264 'BY': 'Belarus',
4265 'BE': 'Belgium',
4266 'BZ': 'Belize',
4267 'BJ': 'Benin',
4268 'BM': 'Bermuda',
4269 'BT': 'Bhutan',
4270 'BO': 'Bolivia, Plurinational State of',
4271 'BQ': 'Bonaire, Sint Eustatius and Saba',
4272 'BA': 'Bosnia and Herzegovina',
4273 'BW': 'Botswana',
4274 'BV': 'Bouvet Island',
4275 'BR': 'Brazil',
4276 'IO': 'British Indian Ocean Territory',
4277 'BN': 'Brunei Darussalam',
4278 'BG': 'Bulgaria',
4279 'BF': 'Burkina Faso',
4280 'BI': 'Burundi',
4281 'KH': 'Cambodia',
4282 'CM': 'Cameroon',
4283 'CA': 'Canada',
4284 'CV': 'Cape Verde',
4285 'KY': 'Cayman Islands',
4286 'CF': 'Central African Republic',
4287 'TD': 'Chad',
4288 'CL': 'Chile',
4289 'CN': 'China',
4290 'CX': 'Christmas Island',
4291 'CC': 'Cocos (Keeling) Islands',
4292 'CO': 'Colombia',
4293 'KM': 'Comoros',
4294 'CG': 'Congo',
4295 'CD': 'Congo, the Democratic Republic of the',
4296 'CK': 'Cook Islands',
4297 'CR': 'Costa Rica',
4298 'CI': 'Côte d\'Ivoire',
4299 'HR': 'Croatia',
4300 'CU': 'Cuba',
4301 'CW': 'Curaçao',
4302 'CY': 'Cyprus',
4303 'CZ': 'Czech Republic',
4304 'DK': 'Denmark',
4305 'DJ': 'Djibouti',
4306 'DM': 'Dominica',
4307 'DO': 'Dominican Republic',
4308 'EC': 'Ecuador',
4309 'EG': 'Egypt',
4310 'SV': 'El Salvador',
4311 'GQ': 'Equatorial Guinea',
4312 'ER': 'Eritrea',
4313 'EE': 'Estonia',
4314 'ET': 'Ethiopia',
4315 'FK': 'Falkland Islands (Malvinas)',
4316 'FO': 'Faroe Islands',
4317 'FJ': 'Fiji',
4318 'FI': 'Finland',
4319 'FR': 'France',
4320 'GF': 'French Guiana',
4321 'PF': 'French Polynesia',
4322 'TF': 'French Southern Territories',
4323 'GA': 'Gabon',
4324 'GM': 'Gambia',
4325 'GE': 'Georgia',
4326 'DE': 'Germany',
4327 'GH': 'Ghana',
4328 'GI': 'Gibraltar',
4329 'GR': 'Greece',
4330 'GL': 'Greenland',
4331 'GD': 'Grenada',
4332 'GP': 'Guadeloupe',
4333 'GU': 'Guam',
4334 'GT': 'Guatemala',
4335 'GG': 'Guernsey',
4336 'GN': 'Guinea',
4337 'GW': 'Guinea-Bissau',
4338 'GY': 'Guyana',
4339 'HT': 'Haiti',
4340 'HM': 'Heard Island and McDonald Islands',
4341 'VA': 'Holy See (Vatican City State)',
4342 'HN': 'Honduras',
4343 'HK': 'Hong Kong',
4344 'HU': 'Hungary',
4345 'IS': 'Iceland',
4346 'IN': 'India',
4347 'ID': 'Indonesia',
4348 'IR': 'Iran, Islamic Republic of',
4349 'IQ': 'Iraq',
4350 'IE': 'Ireland',
4351 'IM': 'Isle of Man',
4352 'IL': 'Israel',
4353 'IT': 'Italy',
4354 'JM': 'Jamaica',
4355 'JP': 'Japan',
4356 'JE': 'Jersey',
4357 'JO': 'Jordan',
4358 'KZ': 'Kazakhstan',
4359 'KE': 'Kenya',
4360 'KI': 'Kiribati',
4361 'KP': 'Korea, Democratic People\'s Republic of',
4362 'KR': 'Korea, Republic of',
4363 'KW': 'Kuwait',
4364 'KG': 'Kyrgyzstan',
4365 'LA': 'Lao People\'s Democratic Republic',
4366 'LV': 'Latvia',
4367 'LB': 'Lebanon',
4368 'LS': 'Lesotho',
4369 'LR': 'Liberia',
4370 'LY': 'Libya',
4371 'LI': 'Liechtenstein',
4372 'LT': 'Lithuania',
4373 'LU': 'Luxembourg',
4374 'MO': 'Macao',
4375 'MK': 'Macedonia, the Former Yugoslav Republic of',
4376 'MG': 'Madagascar',
4377 'MW': 'Malawi',
4378 'MY': 'Malaysia',
4379 'MV': 'Maldives',
4380 'ML': 'Mali',
4381 'MT': 'Malta',
4382 'MH': 'Marshall Islands',
4383 'MQ': 'Martinique',
4384 'MR': 'Mauritania',
4385 'MU': 'Mauritius',
4386 'YT': 'Mayotte',
4387 'MX': 'Mexico',
4388 'FM': 'Micronesia, Federated States of',
4389 'MD': 'Moldova, Republic of',
4390 'MC': 'Monaco',
4391 'MN': 'Mongolia',
4392 'ME': 'Montenegro',
4393 'MS': 'Montserrat',
4394 'MA': 'Morocco',
4395 'MZ': 'Mozambique',
4396 'MM': 'Myanmar',
4397 'NA': 'Namibia',
4398 'NR': 'Nauru',
4399 'NP': 'Nepal',
4400 'NL': 'Netherlands',
4401 'NC': 'New Caledonia',
4402 'NZ': 'New Zealand',
4403 'NI': 'Nicaragua',
4404 'NE': 'Niger',
4405 'NG': 'Nigeria',
4406 'NU': 'Niue',
4407 'NF': 'Norfolk Island',
4408 'MP': 'Northern Mariana Islands',
4409 'NO': 'Norway',
4410 'OM': 'Oman',
4411 'PK': 'Pakistan',
4412 'PW': 'Palau',
4413 'PS': 'Palestine, State of',
4414 'PA': 'Panama',
4415 'PG': 'Papua New Guinea',
4416 'PY': 'Paraguay',
4417 'PE': 'Peru',
4418 'PH': 'Philippines',
4419 'PN': 'Pitcairn',
4420 'PL': 'Poland',
4421 'PT': 'Portugal',
4422 'PR': 'Puerto Rico',
4423 'QA': 'Qatar',
4424 'RE': 'Réunion',
4425 'RO': 'Romania',
4426 'RU': 'Russian Federation',
4427 'RW': 'Rwanda',
4428 'BL': 'Saint Barthélemy',
4429 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4430 'KN': 'Saint Kitts and Nevis',
4431 'LC': 'Saint Lucia',
4432 'MF': 'Saint Martin (French part)',
4433 'PM': 'Saint Pierre and Miquelon',
4434 'VC': 'Saint Vincent and the Grenadines',
4435 'WS': 'Samoa',
4436 'SM': 'San Marino',
4437 'ST': 'Sao Tome and Principe',
4438 'SA': 'Saudi Arabia',
4439 'SN': 'Senegal',
4440 'RS': 'Serbia',
4441 'SC': 'Seychelles',
4442 'SL': 'Sierra Leone',
4443 'SG': 'Singapore',
4444 'SX': 'Sint Maarten (Dutch part)',
4445 'SK': 'Slovakia',
4446 'SI': 'Slovenia',
4447 'SB': 'Solomon Islands',
4448 'SO': 'Somalia',
4449 'ZA': 'South Africa',
4450 'GS': 'South Georgia and the South Sandwich Islands',
4451 'SS': 'South Sudan',
4452 'ES': 'Spain',
4453 'LK': 'Sri Lanka',
4454 'SD': 'Sudan',
4455 'SR': 'Suriname',
4456 'SJ': 'Svalbard and Jan Mayen',
4457 'SZ': 'Swaziland',
4458 'SE': 'Sweden',
4459 'CH': 'Switzerland',
4460 'SY': 'Syrian Arab Republic',
4461 'TW': 'Taiwan, Province of China',
4462 'TJ': 'Tajikistan',
4463 'TZ': 'Tanzania, United Republic of',
4464 'TH': 'Thailand',
4465 'TL': 'Timor-Leste',
4466 'TG': 'Togo',
4467 'TK': 'Tokelau',
4468 'TO': 'Tonga',
4469 'TT': 'Trinidad and Tobago',
4470 'TN': 'Tunisia',
4471 'TR': 'Turkey',
4472 'TM': 'Turkmenistan',
4473 'TC': 'Turks and Caicos Islands',
4474 'TV': 'Tuvalu',
4475 'UG': 'Uganda',
4476 'UA': 'Ukraine',
4477 'AE': 'United Arab Emirates',
4478 'GB': 'United Kingdom',
4479 'US': 'United States',
4480 'UM': 'United States Minor Outlying Islands',
4481 'UY': 'Uruguay',
4482 'UZ': 'Uzbekistan',
4483 'VU': 'Vanuatu',
4484 'VE': 'Venezuela, Bolivarian Republic of',
4485 'VN': 'Viet Nam',
4486 'VG': 'Virgin Islands, British',
4487 'VI': 'Virgin Islands, U.S.',
4488 'WF': 'Wallis and Futuna',
4489 'EH': 'Western Sahara',
4490 'YE': 'Yemen',
4491 'ZM': 'Zambia',
4492 'ZW': 'Zimbabwe',
4493 # Not ISO 3166 codes, but used for IP blocks
4494 'AP': 'Asia/Pacific Region',
4495 'EU': 'Europe',
4496 }
4497
4498 @classmethod
4499 def short2full(cls, code):
4500 """Convert an ISO 3166-2 country code to the corresponding full name"""
4501 return cls._country_map.get(code.upper())
4502
4503
4504 class GeoUtils:
4505 # Major IPv4 address blocks per country
4506 _country_ip_map = {
4507 'AD': '46.172.224.0/19',
4508 'AE': '94.200.0.0/13',
4509 'AF': '149.54.0.0/17',
4510 'AG': '209.59.64.0/18',
4511 'AI': '204.14.248.0/21',
4512 'AL': '46.99.0.0/16',
4513 'AM': '46.70.0.0/15',
4514 'AO': '105.168.0.0/13',
4515 'AP': '182.50.184.0/21',
4516 'AQ': '23.154.160.0/24',
4517 'AR': '181.0.0.0/12',
4518 'AS': '202.70.112.0/20',
4519 'AT': '77.116.0.0/14',
4520 'AU': '1.128.0.0/11',
4521 'AW': '181.41.0.0/18',
4522 'AX': '185.217.4.0/22',
4523 'AZ': '5.197.0.0/16',
4524 'BA': '31.176.128.0/17',
4525 'BB': '65.48.128.0/17',
4526 'BD': '114.130.0.0/16',
4527 'BE': '57.0.0.0/8',
4528 'BF': '102.178.0.0/15',
4529 'BG': '95.42.0.0/15',
4530 'BH': '37.131.0.0/17',
4531 'BI': '154.117.192.0/18',
4532 'BJ': '137.255.0.0/16',
4533 'BL': '185.212.72.0/23',
4534 'BM': '196.12.64.0/18',
4535 'BN': '156.31.0.0/16',
4536 'BO': '161.56.0.0/16',
4537 'BQ': '161.0.80.0/20',
4538 'BR': '191.128.0.0/12',
4539 'BS': '24.51.64.0/18',
4540 'BT': '119.2.96.0/19',
4541 'BW': '168.167.0.0/16',
4542 'BY': '178.120.0.0/13',
4543 'BZ': '179.42.192.0/18',
4544 'CA': '99.224.0.0/11',
4545 'CD': '41.243.0.0/16',
4546 'CF': '197.242.176.0/21',
4547 'CG': '160.113.0.0/16',
4548 'CH': '85.0.0.0/13',
4549 'CI': '102.136.0.0/14',
4550 'CK': '202.65.32.0/19',
4551 'CL': '152.172.0.0/14',
4552 'CM': '102.244.0.0/14',
4553 'CN': '36.128.0.0/10',
4554 'CO': '181.240.0.0/12',
4555 'CR': '201.192.0.0/12',
4556 'CU': '152.206.0.0/15',
4557 'CV': '165.90.96.0/19',
4558 'CW': '190.88.128.0/17',
4559 'CY': '31.153.0.0/16',
4560 'CZ': '88.100.0.0/14',
4561 'DE': '53.0.0.0/8',
4562 'DJ': '197.241.0.0/17',
4563 'DK': '87.48.0.0/12',
4564 'DM': '192.243.48.0/20',
4565 'DO': '152.166.0.0/15',
4566 'DZ': '41.96.0.0/12',
4567 'EC': '186.68.0.0/15',
4568 'EE': '90.190.0.0/15',
4569 'EG': '156.160.0.0/11',
4570 'ER': '196.200.96.0/20',
4571 'ES': '88.0.0.0/11',
4572 'ET': '196.188.0.0/14',
4573 'EU': '2.16.0.0/13',
4574 'FI': '91.152.0.0/13',
4575 'FJ': '144.120.0.0/16',
4576 'FK': '80.73.208.0/21',
4577 'FM': '119.252.112.0/20',
4578 'FO': '88.85.32.0/19',
4579 'FR': '90.0.0.0/9',
4580 'GA': '41.158.0.0/15',
4581 'GB': '25.0.0.0/8',
4582 'GD': '74.122.88.0/21',
4583 'GE': '31.146.0.0/16',
4584 'GF': '161.22.64.0/18',
4585 'GG': '62.68.160.0/19',
4586 'GH': '154.160.0.0/12',
4587 'GI': '95.164.0.0/16',
4588 'GL': '88.83.0.0/19',
4589 'GM': '160.182.0.0/15',
4590 'GN': '197.149.192.0/18',
4591 'GP': '104.250.0.0/19',
4592 'GQ': '105.235.224.0/20',
4593 'GR': '94.64.0.0/13',
4594 'GT': '168.234.0.0/16',
4595 'GU': '168.123.0.0/16',
4596 'GW': '197.214.80.0/20',
4597 'GY': '181.41.64.0/18',
4598 'HK': '113.252.0.0/14',
4599 'HN': '181.210.0.0/16',
4600 'HR': '93.136.0.0/13',
4601 'HT': '148.102.128.0/17',
4602 'HU': '84.0.0.0/14',
4603 'ID': '39.192.0.0/10',
4604 'IE': '87.32.0.0/12',
4605 'IL': '79.176.0.0/13',
4606 'IM': '5.62.80.0/20',
4607 'IN': '117.192.0.0/10',
4608 'IO': '203.83.48.0/21',
4609 'IQ': '37.236.0.0/14',
4610 'IR': '2.176.0.0/12',
4611 'IS': '82.221.0.0/16',
4612 'IT': '79.0.0.0/10',
4613 'JE': '87.244.64.0/18',
4614 'JM': '72.27.0.0/17',
4615 'JO': '176.29.0.0/16',
4616 'JP': '133.0.0.0/8',
4617 'KE': '105.48.0.0/12',
4618 'KG': '158.181.128.0/17',
4619 'KH': '36.37.128.0/17',
4620 'KI': '103.25.140.0/22',
4621 'KM': '197.255.224.0/20',
4622 'KN': '198.167.192.0/19',
4623 'KP': '175.45.176.0/22',
4624 'KR': '175.192.0.0/10',
4625 'KW': '37.36.0.0/14',
4626 'KY': '64.96.0.0/15',
4627 'KZ': '2.72.0.0/13',
4628 'LA': '115.84.64.0/18',
4629 'LB': '178.135.0.0/16',
4630 'LC': '24.92.144.0/20',
4631 'LI': '82.117.0.0/19',
4632 'LK': '112.134.0.0/15',
4633 'LR': '102.183.0.0/16',
4634 'LS': '129.232.0.0/17',
4635 'LT': '78.56.0.0/13',
4636 'LU': '188.42.0.0/16',
4637 'LV': '46.109.0.0/16',
4638 'LY': '41.252.0.0/14',
4639 'MA': '105.128.0.0/11',
4640 'MC': '88.209.64.0/18',
4641 'MD': '37.246.0.0/16',
4642 'ME': '178.175.0.0/17',
4643 'MF': '74.112.232.0/21',
4644 'MG': '154.126.0.0/17',
4645 'MH': '117.103.88.0/21',
4646 'MK': '77.28.0.0/15',
4647 'ML': '154.118.128.0/18',
4648 'MM': '37.111.0.0/17',
4649 'MN': '49.0.128.0/17',
4650 'MO': '60.246.0.0/16',
4651 'MP': '202.88.64.0/20',
4652 'MQ': '109.203.224.0/19',
4653 'MR': '41.188.64.0/18',
4654 'MS': '208.90.112.0/22',
4655 'MT': '46.11.0.0/16',
4656 'MU': '105.16.0.0/12',
4657 'MV': '27.114.128.0/18',
4658 'MW': '102.70.0.0/15',
4659 'MX': '187.192.0.0/11',
4660 'MY': '175.136.0.0/13',
4661 'MZ': '197.218.0.0/15',
4662 'NA': '41.182.0.0/16',
4663 'NC': '101.101.0.0/18',
4664 'NE': '197.214.0.0/18',
4665 'NF': '203.17.240.0/22',
4666 'NG': '105.112.0.0/12',
4667 'NI': '186.76.0.0/15',
4668 'NL': '145.96.0.0/11',
4669 'NO': '84.208.0.0/13',
4670 'NP': '36.252.0.0/15',
4671 'NR': '203.98.224.0/19',
4672 'NU': '49.156.48.0/22',
4673 'NZ': '49.224.0.0/14',
4674 'OM': '5.36.0.0/15',
4675 'PA': '186.72.0.0/15',
4676 'PE': '186.160.0.0/14',
4677 'PF': '123.50.64.0/18',
4678 'PG': '124.240.192.0/19',
4679 'PH': '49.144.0.0/13',
4680 'PK': '39.32.0.0/11',
4681 'PL': '83.0.0.0/11',
4682 'PM': '70.36.0.0/20',
4683 'PR': '66.50.0.0/16',
4684 'PS': '188.161.0.0/16',
4685 'PT': '85.240.0.0/13',
4686 'PW': '202.124.224.0/20',
4687 'PY': '181.120.0.0/14',
4688 'QA': '37.210.0.0/15',
4689 'RE': '102.35.0.0/16',
4690 'RO': '79.112.0.0/13',
4691 'RS': '93.86.0.0/15',
4692 'RU': '5.136.0.0/13',
4693 'RW': '41.186.0.0/16',
4694 'SA': '188.48.0.0/13',
4695 'SB': '202.1.160.0/19',
4696 'SC': '154.192.0.0/11',
4697 'SD': '102.120.0.0/13',
4698 'SE': '78.64.0.0/12',
4699 'SG': '8.128.0.0/10',
4700 'SI': '188.196.0.0/14',
4701 'SK': '78.98.0.0/15',
4702 'SL': '102.143.0.0/17',
4703 'SM': '89.186.32.0/19',
4704 'SN': '41.82.0.0/15',
4705 'SO': '154.115.192.0/18',
4706 'SR': '186.179.128.0/17',
4707 'SS': '105.235.208.0/21',
4708 'ST': '197.159.160.0/19',
4709 'SV': '168.243.0.0/16',
4710 'SX': '190.102.0.0/20',
4711 'SY': '5.0.0.0/16',
4712 'SZ': '41.84.224.0/19',
4713 'TC': '65.255.48.0/20',
4714 'TD': '154.68.128.0/19',
4715 'TG': '196.168.0.0/14',
4716 'TH': '171.96.0.0/13',
4717 'TJ': '85.9.128.0/18',
4718 'TK': '27.96.24.0/21',
4719 'TL': '180.189.160.0/20',
4720 'TM': '95.85.96.0/19',
4721 'TN': '197.0.0.0/11',
4722 'TO': '175.176.144.0/21',
4723 'TR': '78.160.0.0/11',
4724 'TT': '186.44.0.0/15',
4725 'TV': '202.2.96.0/19',
4726 'TW': '120.96.0.0/11',
4727 'TZ': '156.156.0.0/14',
4728 'UA': '37.52.0.0/14',
4729 'UG': '102.80.0.0/13',
4730 'US': '6.0.0.0/8',
4731 'UY': '167.56.0.0/13',
4732 'UZ': '84.54.64.0/18',
4733 'VA': '212.77.0.0/19',
4734 'VC': '207.191.240.0/21',
4735 'VE': '186.88.0.0/13',
4736 'VG': '66.81.192.0/20',
4737 'VI': '146.226.0.0/16',
4738 'VN': '14.160.0.0/11',
4739 'VU': '202.80.32.0/20',
4740 'WF': '117.20.32.0/21',
4741 'WS': '202.4.32.0/19',
4742 'YE': '134.35.0.0/16',
4743 'YT': '41.242.116.0/22',
4744 'ZA': '41.0.0.0/11',
4745 'ZM': '102.144.0.0/13',
4746 'ZW': '102.177.192.0/18',
4747 }
4748
4749 @classmethod
4750 def random_ipv4(cls, code_or_block):
4751 if len(code_or_block) == 2:
4752 block = cls._country_ip_map.get(code_or_block.upper())
4753 if not block:
4754 return None
4755 else:
4756 block = code_or_block
4757 addr, preflen = block.split('/')
4758 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4759 addr_max = addr_min | (0xffffffff >> int(preflen))
4760 return str(socket.inet_ntoa(
4761 struct.pack('!L', random.randint(addr_min, addr_max))))
4762
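# Example (illustrative; the addresses shown are made up, since the result is
# random): either a two-letter country code or an explicit CIDR block works:
#   GeoUtils.random_ipv4('DE')  # e.g. '53.12.34.56', drawn from 53.0.0.0/8
#   GeoUtils.random_ipv4('192.0.2.0/24')  # e.g. '192.0.2.17'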
4763
4764 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4765 def __init__(self, proxies=None):
4766 # Set default handlers
4767 for type in ('http', 'https'):
4768 setattr(self, '%s_open' % type,
4769 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4770 meth(r, proxy, type))
4771 urllib.request.ProxyHandler.__init__(self, proxies)
4772
4773 def proxy_open(self, req, proxy, type):
4774 req_proxy = req.headers.get('Ytdl-request-proxy')
4775 if req_proxy is not None:
4776 proxy = req_proxy
4777 del req.headers['Ytdl-request-proxy']
4778
4779 if proxy == '__noproxy__':
4780 return None # No Proxy
4781 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4782 req.add_header('Ytdl-socks-proxy', proxy)
4783 # yt-dlp's http/https handlers wrap the socket with socks
4784 return None
4785 return urllib.request.ProxyHandler.proxy_open(
4786 self, req, proxy, type)
4787
4788
4789 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4790 # released into Public Domain
4791 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4792
4793 def long_to_bytes(n, blocksize=0):
4794 """long_to_bytes(n:long, blocksize:int) : string
4795 Convert a long integer to a byte string.
4796
4797 If optional blocksize is given and greater than zero, pad the front of the
4798 byte string with binary zeros so that the length is a multiple of
4799 blocksize.
4800 """
4801 # after much testing, this algorithm was deemed to be the fastest
4802 s = b''
4803 n = int(n)
4804 while n > 0:
4805 s = struct.pack('>I', n & 0xffffffff) + s
4806 n = n >> 32
4807 # strip off leading zeros
4808 for i in range(len(s)):
4809 if s[i] != b'\000'[0]:
4810 break
4811 else:
4812 # only happens when n == 0
4813 s = b'\000'
4814 i = 0
4815 s = s[i:]
4816 # add back some pad bytes. this could be done more efficiently w.r.t. the
4817 # de-padding being done above, but sigh...
4818 if blocksize > 0 and len(s) % blocksize:
4819 s = (blocksize - len(s) % blocksize) * b'\000' + s
4820 return s
4821
4822
4823 def bytes_to_long(s):
4824 """bytes_to_long(string) : long
4825 Convert a byte string to a long integer.
4826
4827 This is (essentially) the inverse of long_to_bytes().
4828 """
4829 acc = 0
4830 length = len(s)
4831 if length % 4:
4832 extra = (4 - length % 4)
4833 s = b'\000' * extra + s
4834 length = length + extra
4835 for i in range(0, length, 4):
4836 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4837 return acc
4838
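# Example (illustrative): the two functions are inverses of each other:
#   bytes_to_long(b'\x01\x00') == 256
#   long_to_bytes(256) == b'\x01\x00'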
4839
4840 def ohdave_rsa_encrypt(data, exponent, modulus):
4841 '''
4842 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4843
4844 Input:
4845 data: data to encrypt, bytes-like object
4846 exponent, modulus: parameters e and N of the RSA algorithm, both integers
4847 Output: hex string of encrypted data
4848
4849 Limitation: supports one block encryption only
4850 '''
4851
4852 payload = int(binascii.hexlify(data[::-1]), 16)
4853 encrypted = pow(payload, exponent, modulus)
4854 return '%x' % encrypted
4855
4856
4857 def pkcs1pad(data, length):
4858 """
4859 Pad input data using the PKCS#1 scheme
4860
4861 @param {int[]} data input data
4862 @param {int} length target length
4863 @returns {int[]} padded data
4864 """
4865 if len(data) > length - 11:
4866 raise ValueError('Input data too long for PKCS#1 padding')
4867
4868 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)] # PKCS#1 v1.5 requires the padding bytes to be nonzero
4869 return [0, 2] + pseudo_random + [0] + data
4870
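# Example (illustrative): the padded block always has exactly `length` entries,
# laid out as [0x00, 0x02, <random nonzero padding>, 0x00, <data>]:
#   len(pkcs1pad([1, 2, 3], 16)) == 16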
4871
4872 def _base_n_table(n, table):
4873 if not table and not n:
4874 raise ValueError('Either table or n must be specified')
4875 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4876
4877 if n and n != len(table):
4878 raise ValueError(f'base {n} exceeds table length {len(table)}')
4879 return table
4880
4881
4882 def encode_base_n(num, n=None, table=None):
4883 """Convert given int to a base-n string"""
4884 table = _base_n_table(n, table)
4885 if not num:
4886 return table[0]
4887
4888 result, base = '', len(table)
4889 while num:
4890 result = table[num % base] + result
4891 num = num // base
4892 return result
4893
4894
4895 def decode_base_n(string, n=None, table=None):
4896 """Convert given base-n string to int"""
4897 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4898 result, base = 0, len(table)
4899 for char in string:
4900 result = result * base + table[char]
4901 return result
4902
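# Example (illustrative): with the default table, base 16 behaves like hex:
#   encode_base_n(255, 16) == 'ff'
#   decode_base_n('ff', 16) == 255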
4903
4904 def decode_base(value, digits):
4905 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4906 f'in a future version. Use {__name__}.decode_base_n instead')
4907 return decode_base_n(value, table=digits)
4908
4909
4910 def decode_packed_codes(code):
4911 mobj = re.search(PACKED_CODES_RE, code)
4912 obfuscated_code, base, count, symbols = mobj.groups()
4913 base = int(base)
4914 count = int(count)
4915 symbols = symbols.split('|')
4916 symbol_table = {}
4917
4918 while count:
4919 count -= 1
4920 base_n_count = encode_base_n(count, base)
4921 symbol_table[base_n_count] = symbols[count] or base_n_count
4922
4923 return re.sub(
4924 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4925 obfuscated_code)
4926
4927
4928 def caesar(s, alphabet, shift):
4929 if shift == 0:
4930 return s
4931 l = len(alphabet)
4932 return ''.join(
4933 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4934 for c in s)
4935
4936
4937 def rot47(s):
4938 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4939
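# Example (illustrative): rot47 shifts within the 94 printable ASCII characters
# and is its own inverse, since 47 + 47 == 94:
#   rot47('foo') == '7@@'
#   rot47(rot47('foo')) == 'foo'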
4940
4941 def parse_m3u8_attributes(attrib):
4942 info = {}
4943 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4944 if val.startswith('"'):
4945 val = val[1:-1]
4946 info[key] = val
4947 return info
4948
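# Example (illustrative): quoted values may contain commas and are unquoted:
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   == {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}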
4949
4950 def urshift(val, n):
4951 return val >> n if val >= 0 else (val + 0x100000000) >> n
4952
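# Example (illustrative): urshift emulates JavaScript's unsigned `>>>` on
# 32-bit values:
#   urshift(-1, 28) == 15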
4953
4954 # Based on png2str() written by @gdkchan and improved by @yokrysty
4955 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4956 def decode_png(png_data):
4957 # Reference: https://www.w3.org/TR/PNG/
4958 header = png_data[8:]
4959
4960 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4961 raise OSError('Not a valid PNG file.')
4962
4963 int_map = {1: '>B', 2: '>H', 4: '>I'}
4964 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4965
4966 chunks = []
4967
4968 while header:
4969 length = unpack_integer(header[:4])
4970 header = header[4:]
4971
4972 chunk_type = header[:4]
4973 header = header[4:]
4974
4975 chunk_data = header[:length]
4976 header = header[length:]
4977
4978 header = header[4:] # Skip CRC
4979
4980 chunks.append({
4981 'type': chunk_type,
4982 'length': length,
4983 'data': chunk_data
4984 })
4985
4986 ihdr = chunks[0]['data']
4987
4988 width = unpack_integer(ihdr[:4])
4989 height = unpack_integer(ihdr[4:8])
4990
4991 idat = b''
4992
4993 for chunk in chunks:
4994 if chunk['type'] == b'IDAT':
4995 idat += chunk['data']
4996
4997 if not idat:
4998 raise OSError('Unable to read PNG data.')
4999
5000 decompressed_data = bytearray(zlib.decompress(idat))
5001
5002 stride = width * 3
5003 pixels = []
5004
5005 def _get_pixel(idx):
5006 x = idx % stride
5007 y = idx // stride
5008 return pixels[y][x]
5009
5010 for y in range(height):
5011 basePos = y * (1 + stride)
5012 filter_type = decompressed_data[basePos]
5013
5014 current_row = []
5015
5016 pixels.append(current_row)
5017
5018 for x in range(stride):
5019 color = decompressed_data[1 + basePos + x]
5020 basex = y * stride + x
5021 left = 0
5022 up = 0
5023
5024 if x > 2:
5025 left = _get_pixel(basex - 3)
5026 if y > 0:
5027 up = _get_pixel(basex - stride)
5028
5029 if filter_type == 1: # Sub
5030 color = (color + left) & 0xff
5031 elif filter_type == 2: # Up
5032 color = (color + up) & 0xff
5033 elif filter_type == 3: # Average
5034 color = (color + ((left + up) >> 1)) & 0xff
5035 elif filter_type == 4: # Paeth
5036 a = left
5037 b = up
5038 c = 0
5039
5040 if x > 2 and y > 0:
5041 c = _get_pixel(basex - stride - 3)
5042
5043 p = a + b - c
5044
5045 pa = abs(p - a)
5046 pb = abs(p - b)
5047 pc = abs(p - c)
5048
5049 if pa <= pb and pa <= pc:
5050 color = (color + a) & 0xff
5051 elif pb <= pc:
5052 color = (color + b) & 0xff
5053 else:
5054 color = (color + c) & 0xff
5055
5056 current_row.append(color)
5057
5058 return width, height, pixels
5059
5060
5061 def write_xattr(path, key, value):
5062 # Windows: Write xattrs to NTFS Alternate Data Streams:
5063 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5064 if compat_os_name == 'nt':
5065 assert ':' not in key
5066 assert os.path.exists(path)
5067
5068 try:
5069 with open(f'{path}:{key}', 'wb') as f:
5070 f.write(value)
5071 except OSError as e:
5072 raise XAttrMetadataError(e.errno, e.strerror)
5073 return
5074
5075 # UNIX Method 1. Use xattrs/pyxattrs modules
5076
5077 setxattr = None
5078 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5079 # Unicode arguments are not supported in pyxattr until version 0.5.0
5080 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5081 if version_tuple(xattr.__version__) >= (0, 5, 0):
5082 setxattr = xattr.set
5083 elif xattr:
5084 setxattr = xattr.setxattr
5085
5086 if setxattr:
5087 try:
5088 setxattr(path, key, value)
5089 except OSError as e:
5090 raise XAttrMetadataError(e.errno, e.strerror)
5091 return
5092
5093 # UNIX Method 2. Use setfattr/xattr executables
5094 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5095 else 'xattr' if check_executable('xattr', ['-h']) else None)
5096 if not exe:
5097 raise XAttrUnavailableError(
5098 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5099 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5100
5101 value = value.decode()
5102 try:
5103 _, stderr, returncode = Popen.run(
5104 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5105 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5106 except OSError as e:
5107 raise XAttrMetadataError(e.errno, e.strerror)
5108 if returncode:
5109 raise XAttrMetadataError(returncode, stderr)
5110
5111
5112 def random_birthday(year_field, month_field, day_field):
5113 start_date = datetime.date(1950, 1, 1)
5114 end_date = datetime.date(1995, 12, 31)
5115 offset = random.randint(0, (end_date - start_date).days)
5116 random_date = start_date + datetime.timedelta(offset)
5117 return {
5118 year_field: str(random_date.year),
5119 month_field: str(random_date.month),
5120 day_field: str(random_date.day),
5121 }
5122
5123
5124 # Templates for internet shortcut files, which are plain text files.
5125 DOT_URL_LINK_TEMPLATE = '''\
5126 [InternetShortcut]
5127 URL=%(url)s
5128 '''
5129
5130 DOT_WEBLOC_LINK_TEMPLATE = '''\
5131 <?xml version="1.0" encoding="UTF-8"?>
5132 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5133 <plist version="1.0">
5134 <dict>
5135 \t<key>URL</key>
5136 \t<string>%(url)s</string>
5137 </dict>
5138 </plist>
5139 '''
5140
5141 DOT_DESKTOP_LINK_TEMPLATE = '''\
5142 [Desktop Entry]
5143 Encoding=UTF-8
5144 Name=%(filename)s
5145 Type=Link
5146 URL=%(url)s
5147 Icon=text-html
5148 '''
5149
5150 LINK_TEMPLATES = {
5151 'url': DOT_URL_LINK_TEMPLATE,
5152 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5153 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5154 }
5155
5156
5157 def iri_to_uri(iri):
5158 """
5159 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5160
5161 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes (using an underlying UTF-8 encoding) only those characters that are not already escaped, leaving existing escape sequences intact.
5162 """
5163
5164 iri_parts = urllib.parse.urlparse(iri)
5165
5166 if '[' in iri_parts.netloc:
5167 raise ValueError('IPv6 URIs are not yet supported.')
5168 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5169
5170 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5171
5172 net_location = ''
5173 if iri_parts.username:
5174 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5175 if iri_parts.password is not None:
5176 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5177 net_location += '@'
5178
5179 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5180 # The 'idna' encoding produces ASCII text.
5181 if iri_parts.port is not None and iri_parts.port != 80:
5182 net_location += ':' + str(iri_parts.port)
5183
5184 return urllib.parse.urlunparse(
5185 (iri_parts.scheme,
5186 net_location,
5187
5188 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5189
5190 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5191 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5192
5193 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5194 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5195
5196 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5197
5198 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5199
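# Example (illustrative): non-ASCII path characters are percent-encoded as
# UTF-8 while the rest of the URL is left alone:
#   iri_to_uri('http://www.example.com/héllo')
#   == 'http://www.example.com/h%C3%A9llo'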
5200
5201 def to_high_limit_path(path):
5202 if sys.platform in ['win32', 'cygwin']:
5203 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5204 return '\\\\?\\' + os.path.abspath(path)
5205
5206 return path
5207
5208
5209 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5210 val = traverse_obj(obj, *variadic(field))
5211 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5212 return default
5213 return template % func(val)
5214
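# Example (illustrative):
#   format_field({'height': 1080}, 'height', '%sp') == '1080p'
#   format_field({}, 'height', '%sp', default='unknown') == 'unknown'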
5215
5216 def clean_podcast_url(url):
5217 return re.sub(r'''(?x)
5218 (?:
5219 (?:
5220 chtbl\.com/track|
5221 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5222 play\.podtrac\.com
5223 )/[^/]+|
5224 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5225 flex\.acast\.com|
5226 pd(?:
5227 cn\.co| # https://podcorn.com/analytics-prefix/
5228 st\.fm # https://podsights.com/docs/
5229 )/e
5230 )/''', '', url)
5231
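# Example (illustrative, with a made-up tracker ID):
#   clean_podcast_url('https://chtbl.com/track/ABC123/example.com/episode.mp3')
#   == 'https://example.com/episode.mp3'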
5232
5233 _HEX_TABLE = '0123456789abcdef'
5234
5235
5236 def random_uuidv4():
5237 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5238
5239
5240 def make_dir(path, to_screen=None):
5241 try:
5242 dn = os.path.dirname(path)
5243 if dn and not os.path.exists(dn):
5244 os.makedirs(dn)
5245 return True
5246 except OSError as err:
5247 if callable(to_screen):  # `callable()` returns a bool; comparing it to None was always true
5248 to_screen('unable to create directory ' + error_to_compat_str(err))
5249 return False
5250
5251
5252 def get_executable_path():
5253 from .update import _get_variant_and_executable_path
5254
5255 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5256
5257
5258 def load_plugins(name, suffix, namespace):
5259 classes = {}
5260 with contextlib.suppress(FileNotFoundError):
5261 plugins_spec = importlib.util.spec_from_file_location(
5262 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5263 plugins = importlib.util.module_from_spec(plugins_spec)
5264 sys.modules[plugins_spec.name] = plugins
5265 plugins_spec.loader.exec_module(plugins)
5266 for name in dir(plugins):
5267 if name in namespace:
5268 continue
5269 if not name.endswith(suffix):
5270 continue
5271 klass = getattr(plugins, name)
5272 classes[name] = namespace[name] = klass
5273 return classes
5274
5275
5276 def traverse_obj(
5277 obj, *path_list, default=None, expected_type=None, get_all=True,
5278 casesense=True, is_user_input=False, traverse_string=False):
5279 ''' Traverse nested list/dict/tuple
5280 @param path_list A list of paths which are checked one by one.
5281 Each path is a list of keys where each key is a:
5282 - None: Do nothing
5283 - string: A dictionary key / regex group
5284 - int: An index into a list
5285 - tuple: A list of keys all of which will be traversed
5286 - Ellipsis: Fetch all values in the object
5287 - Function: Takes the key and value as arguments
5288 and returns whether the key matches or not
5289 @param default Default value to return
5290 @param expected_type Only accept final value of this type (Can also be any callable)
5291 @param get_all Return all the values obtained from a path or only the first one
5292 @param casesense Whether to consider dictionary keys as case sensitive
5293
5294 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5295
5296 @param path_list In addition to the above,
5297 - dict: Given {k:v, ...}; return {k: traverse_obj(obj, v), ...}
5298 @param is_user_input Whether the keys are generated from user input. If True,
5299 strings are converted to int/slice if necessary
5300 @param traverse_string Whether to traverse inside strings. If True, any
5301 non-compatible object will also be converted into a string
5302 ''' # TODO: Write tests
5303 if not casesense:
5304 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5305 path_list = (map(_lower, variadic(path)) for path in path_list)
5306
5307 def _traverse_obj(obj, path, _current_depth=0):
5308 nonlocal depth
5309 path = tuple(variadic(path))
5310 for i, key in enumerate(path):
5311 if None in (key, obj):
5312 return obj
5313 if isinstance(key, (list, tuple)):
5314 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5315 key = ...
5316
5317 if key is ...:
5318 obj = (obj.values() if isinstance(obj, dict)
5319 else obj if isinstance(obj, (list, tuple, LazyList))
5320 else str(obj) if traverse_string else [])
5321 _current_depth += 1
5322 depth = max(depth, _current_depth)
5323 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5324 elif isinstance(key, dict):
5325 obj = filter_dict({k: _traverse_obj(obj, v, _current_depth) for k, v in key.items()})
5326 elif callable(key):
5327 if isinstance(obj, (list, tuple, LazyList)):
5328 obj = enumerate(obj)
5329 elif isinstance(obj, dict):
5330 obj = obj.items()
5331 else:
5332 if not traverse_string:
5333 return None
5334 obj = str(obj)
5335 _current_depth += 1
5336 depth = max(depth, _current_depth)
5337 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5338 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5339 obj = (obj.get(key) if casesense or (key in obj)
5340 else next((v for k, v in obj.items() if _lower(k) == key), None))
5341 else:
5342 if is_user_input:
5343 key = (int_or_none(key) if ':' not in key
5344 else slice(*map(int_or_none, key.split(':'))))
5345 if key == slice(None):
5346 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5347 if not isinstance(key, (int, slice)):
5348 return None
5349 if not isinstance(obj, (list, tuple, LazyList)):
5350 if not traverse_string:
5351 return None
5352 obj = str(obj)
5353 try:
5354 obj = obj[key]
5355 except IndexError:
5356 return None
5357 return obj
5358
5359 if isinstance(expected_type, type):
5360 type_test = lambda val: val if isinstance(val, expected_type) else None
5361 else:
5362 type_test = expected_type or IDENTITY
5363
5364 for path in path_list:
5365 depth = 0
5366 val = _traverse_obj(obj, path)
5367 if val is not None:
5368 if depth:
5369 for _ in range(depth - 1):
5370 val = itertools.chain.from_iterable(v for v in val if v is not None)
5371 val = [v for v in map(type_test, val) if v is not None]
5372 if val:
5373 return val if get_all else val[0]
5374 else:
5375 val = type_test(val)
5376 if val is not None:
5377 return val
5378 return default
5379
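# Example (illustrative): `...` fans out over every value at that level:
#   traverse_obj({'formats': [{'url': 'u1'}, {'url': 'u2'}]}, ('formats', ..., 'url'))
#   == ['u1', 'u2']
#   traverse_obj({'a': {'b': 1}}, ('a', 'b')) == 1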
5380
5381 def traverse_dict(dictn, keys, casesense=True):
5382 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5383 f'in a future version. Use "{__name__}.traverse_obj" instead')
5384 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5385
5386
5387 def get_first(obj, keys, **kwargs):
5388 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5389
5390
5391 def variadic(x, allowed_types=(str, bytes, dict)):
5392 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5393
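# Example (illustrative): strings, bytes and dicts count as single values,
# other iterables are passed through:
#   variadic('spam') == ('spam',)
#   variadic(['spam', 'eggs']) == ['spam', 'eggs']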
5394
5395 def time_seconds(**kwargs):
5396 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5397 return t.timestamp()
5398
5399
5400 # create a JSON Web Signature (jws) with HS256 algorithm
5401 # the resulting format is in JWS Compact Serialization
5402 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5403 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5404 def jwt_encode_hs256(payload_data, key, headers={}):
5405 header_data = {
5406 'alg': 'HS256',
5407 'typ': 'JWT',
5408 }
5409 if headers:
5410 header_data.update(headers)
5411 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5412 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5413 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5414 signature_b64 = base64.b64encode(h.digest())
5415 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5416 return token
5417
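# Example (illustrative, with a made-up key): the result is the compact
# 'header.payload.signature' serialization, as bytes:
#   jwt_encode_hs256({'uid': 123}, 'secret-key').count(b'.') == 2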
5418
5419 # Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5420 def jwt_decode_hs256(jwt):
5421 header_b64, payload_b64, signature_b64 = jwt.split('.')
5422 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5423 return payload_data
5424
5425
5426 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5427
5428
5429 @functools.cache
5430 def supports_terminal_sequences(stream):
5431 if compat_os_name == 'nt':
5432 if not WINDOWS_VT_MODE:
5433 return False
5434 elif not os.getenv('TERM'):
5435 return False
5436 try:
5437 return stream.isatty()
5438 except BaseException:
5439 return False
5440
5441
5442 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5443 if get_windows_version() < (10, 0, 10586):
5444 return
5445 global WINDOWS_VT_MODE
5446 try:
5447 Popen.run('', shell=True)
5448 except Exception:
5449 return
5450
5451 WINDOWS_VT_MODE = True
5452 supports_terminal_sequences.cache_clear()
5453
5454
5455 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5456
5457
5458 def remove_terminal_sequences(string):
5459 return _terminal_sequences_re.sub('', string)
5460
5461
5462 def number_of_digits(number):
5463 return len('%d' % number)
5464
5465
5466 def join_nonempty(*values, delim='-', from_dict=None):
5467 if from_dict is not None:
5468 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5469 return delim.join(map(str, filter(None, values)))
5470
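# Illustrative examples (not part of the original module); falsy values are dropped:
#   join_nonempty('mp4', None, '', 1080)  == 'mp4-1080'
#   join_nonempty('res', 'fps', from_dict={'res': '1080p', 'fps': 60}, delim='_')  == '1080p_60'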
5471
5472 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5473 """
5474 Find the largest format dimensions in terms of video width and, for each thumbnail:
5475 * Modify the URL: Match the width with the provided regex and replace with the former width
5476 * Update dimensions
5477
5478 This function is useful with video services that scale the provided thumbnails on demand
5479 """
5480 _keys = ('width', 'height')
5481 max_dimensions = max(
5482 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5483 default=(0, 0))
5484 if not max_dimensions[0]:
5485 return thumbnails
5486 return [
5487 merge_dicts(
5488 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5489 dict(zip(_keys, max_dimensions)), thumbnail)
5490 for thumbnail in thumbnails
5491 ]
5492
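# A minimal sketch of the intended usage (URL and regex are hypothetical):
#   formats = [{'width': 1920, 'height': 1080}]
#   thumbnails = [{'url': 'https://example.invalid/thumb/320.jpg', 'width': 320}]
#   scale_thumbnails_to_max_format_width(formats, thumbnails, r'\d+(?=\.jpg)')
#   == [{'url': 'https://example.invalid/thumb/1920.jpg', 'width': 1920, 'height': 1080}]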
5493
5494 def parse_http_range(range):
5495 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5496 if not range:
5497 return None, None, None
5498 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5499 if not crg:
5500 return None, None, None
5501 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5502
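# Illustrative examples (not part of the original module):
#   parse_http_range('bytes=0-499')        == (0, 499, None)
#   parse_http_range('bytes 500-999/1234') == (500, 999, 1234)
#   parse_http_range(None)                 == (None, None, None)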
5503
5504 def read_stdin(what):
5505 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5506 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5507 return sys.stdin
5508
5509
5510 def determine_file_encoding(data):
5511 """
5512 Detect the text encoding used
5513 @returns (encoding, bytes to skip)
5514 """
5515
    # BOMs are given priority over coding declarations
5517 for bom, enc in BOMS:
5518 if data.startswith(bom):
5519 return enc, len(bom)
5520
5521 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5522 # We ignore the endianness to get a good enough match
5523 data = data.replace(b'\0', b'')
5524 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5525 return mobj.group(1).decode() if mobj else None, 0
5526
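# Illustrative example (not part of the original module): a PEP 263 style coding
# declaration is detected when no BOM is present:
#   determine_file_encoding(b'# coding: utf-8\n--flag') == ('utf-8', 0)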
5527
5528 class Config:
5529 own_args = None
5530 parsed_args = None
5531 filename = None
5532 __initialized = False
5533
5534 def __init__(self, parser, label=None):
5535 self.parser, self.label = parser, label
5536 self._loaded_paths, self.configs = set(), []
5537
5538 def init(self, args=None, filename=None):
5539 assert not self.__initialized
5540 self.own_args, self.filename = args, filename
5541 return self.load_configs()
5542
5543 def load_configs(self):
5544 directory = ''
5545 if self.filename:
5546 location = os.path.realpath(self.filename)
5547 directory = os.path.dirname(location)
5548 if location in self._loaded_paths:
5549 return False
5550 self._loaded_paths.add(location)
5551
5552 self.__initialized = True
5553 opts, _ = self.parser.parse_known_args(self.own_args)
5554 self.parsed_args = self.own_args
5555 for location in opts.config_locations or []:
5556 if location == '-':
5557 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5558 continue
5559 location = os.path.join(directory, expand_path(location))
5560 if os.path.isdir(location):
5561 location = os.path.join(location, 'yt-dlp.conf')
5562 if not os.path.exists(location):
5563 self.parser.error(f'config location {location} does not exist')
5564 self.append_config(self.read_file(location), location)
5565 return True
5566
5567 def __str__(self):
5568 label = join_nonempty(
5569 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5570 delim=' ')
5571 return join_nonempty(
5572 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5573 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5574 delim='\n')
5575
5576 @staticmethod
5577 def read_file(filename, default=[]):
5578 try:
5579 optionf = open(filename, 'rb')
5580 except OSError:
5581 return default # silently skip if file is not present
5582 try:
5583 enc, skip = determine_file_encoding(optionf.read(512))
5584 optionf.seek(skip, io.SEEK_SET)
5585 except OSError:
5586 enc = None # silently skip read errors
5587 try:
5588 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5589 contents = optionf.read().decode(enc or preferredencoding())
5590 res = shlex.split(contents, comments=True)
5591 except Exception as err:
5592 raise ValueError(f'Unable to parse "{filename}": {err}')
5593 finally:
5594 optionf.close()
5595 return res
5596
5597 @staticmethod
5598 def hide_login_info(opts):
5599 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5600 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5601
5602 def _scrub_eq(o):
5603 m = eqre.match(o)
5604 if m:
5605 return m.group('key') + '=PRIVATE'
5606 else:
5607 return o
5608
5609 opts = list(map(_scrub_eq, opts))
5610 for idx, opt in enumerate(opts):
5611 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5612 opts[idx + 1] = 'PRIVATE'
5613 return opts
5614
5615 def append_config(self, *args, label=None):
5616 config = type(self)(self.parser, label)
5617 config._loaded_paths = self._loaded_paths
5618 if config.init(*args):
5619 self.configs.append(config)
5620
5621 @property
5622 def all_args(self):
5623 for config in reversed(self.configs):
5624 yield from config.all_args
5625 yield from self.parsed_args or []
5626
5627 def parse_known_args(self, **kwargs):
5628 return self.parser.parse_known_args(self.all_args, **kwargs)
5629
5630 def parse_args(self):
5631 return self.parser.parse_args(self.all_args)
5632
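# Illustrative example (not part of the original module) of credential scrubbing:
#   Config.hide_login_info(['-u', 'name', '--password=secret'])
#   == ['-u', 'PRIVATE', '--password=PRIVATE']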
5633
class WebSocketsWrapper:
    """Wraps the websockets module for use in non-async scopes"""
5636 pool = None
5637
5638 def __init__(self, url, headers=None, connect=True):
5639 self.loop = asyncio.new_event_loop()
5640 # XXX: "loop" is deprecated
5641 self.conn = websockets.connect(
5642 url, extra_headers=headers, ping_interval=None,
5643 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5644 if connect:
5645 self.__enter__()
5646 atexit.register(self.__exit__, None, None, None)
5647
5648 def __enter__(self):
5649 if not self.pool:
5650 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5651 return self
5652
5653 def send(self, *args):
5654 self.run_with_loop(self.pool.send(*args), self.loop)
5655
5656 def recv(self, *args):
5657 return self.run_with_loop(self.pool.recv(*args), self.loop)
5658
    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancel pending tasks before closing the loop;
            # _cancel_all_tasks calls run_until_complete, which raises on a closed loop
            self._cancel_all_tasks(self.loop)
            self.loop.close()
5665
    # Taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # For contributors: if any new library that uses asyncio needs to be run in non-async scopes,
    # move these functions out of this class
5668 @staticmethod
5669 def run_with_loop(main, loop):
5670 if not asyncio.iscoroutine(main):
5671 raise ValueError(f'a coroutine was expected, got {main!r}')
5672
5673 try:
5674 return loop.run_until_complete(main)
5675 finally:
5676 loop.run_until_complete(loop.shutdown_asyncgens())
5677 if hasattr(loop, 'shutdown_default_executor'):
5678 loop.run_until_complete(loop.shutdown_default_executor())
5679
5680 @staticmethod
5681 def _cancel_all_tasks(loop):
5682 to_cancel = asyncio.all_tasks(loop)
5683
5684 if not to_cancel:
5685 return
5686
5687 for task in to_cancel:
5688 task.cancel()
5689
5690 # XXX: "loop" is removed in python 3.10+
5691 loop.run_until_complete(
5692 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5693
5694 for task in to_cancel:
5695 if task.cancelled():
5696 continue
5697 if task.exception() is not None:
5698 loop.call_exception_handler({
5699 'message': 'unhandled exception during asyncio.run() shutdown',
5700 'exception': task.exception(),
5701 'task': task,
5702 })
5703
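# A minimal usage sketch (not part of the original module; the URL is hypothetical).
# send/recv block on the wrapper's private event loop:
#   ws = WebSocketsWrapper('wss://example.invalid/socket', headers={'Origin': 'https://example.invalid'})
#   ws.send('{"op": "subscribe"}')
#   reply = ws.recv()
#   ws.__exit__(None, None, None)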
5704
5705 def merge_headers(*dicts):
5706 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5707 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5708
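# Illustrative example (not part of the original module); later dicts win and
# keys are normalized via str.title():
#   merge_headers({'user-agent': 'A', 'accept': '*/*'}, {'User-Agent': 'B'})
#   == {'User-Agent': 'B', 'Accept': '*/*'}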
5709
5710 def cached_method(f):
5711 """Cache a method"""
5712 signature = inspect.signature(f)
5713
5714 @functools.wraps(f)
5715 def wrapper(self, *args, **kwargs):
5716 bound_args = signature.bind(self, *args, **kwargs)
5717 bound_args.apply_defaults()
5718 key = tuple(bound_args.arguments.values())
5719
5720 if not hasattr(self, '__cached_method__cache'):
5721 self.__cached_method__cache = {}
5722 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5723 if key not in cache:
5724 cache[key] = f(self, *args, **kwargs)
5725 return cache[key]
5726 return wrapper
5727
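# Illustrative sketch (not part of the original module); the cache lives on the
# instance, so each object memoizes independently:
#   class HypotheticalClient:
#       @cached_method
#       def fetch(self, url):
#           return expensive_request(url)  # hypothetical helper; runs once per (instance, url)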
5728
5729 class classproperty:
5730 """property access for class methods"""
5731
5732 def __init__(self, func):
5733 functools.update_wrapper(self, func)
5734 self.func = func
5735
5736 def __get__(self, _, cls):
5737 return self.func(cls)
5738
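# Illustrative sketch (not part of the original module):
#   class HypotheticalIE:
#       @classproperty
#       def ie_key(cls):
#           return cls.__name__[:-2]
#   HypotheticalIE.ie_key == 'Hypothetical'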
5739
5740 class Namespace(types.SimpleNamespace):
5741 """Immutable namespace"""
5742
5743 def __iter__(self):
5744 return iter(self.__dict__.values())
5745
5746 @property
5747 def items_(self):
5748 return self.__dict__.items()
5749
5750
5751 MEDIA_EXTENSIONS = Namespace(
5752 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5753 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5754 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5755 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5756 thumbnails=('jpg', 'png', 'webp'),
5757 storyboards=('mhtml', ),
5758 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5759 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5760 )
5761 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5762 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5763
5764 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5765
5766
5767 class RetryManager:
5768 """Usage:
5769 for retry in RetryManager(...):
5770 try:
5771 ...
5772 except SomeException as err:
5773 retry.error = err
5774 continue
5775 """
5776 attempt, _error = 0, None
5777
5778 def __init__(self, _retries, _error_callback, **kwargs):
5779 self.retries = _retries or 0
5780 self.error_callback = functools.partial(_error_callback, **kwargs)
5781
5782 def _should_retry(self):
5783 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5784
5785 @property
5786 def error(self):
5787 if self._error is NO_DEFAULT:
5788 return None
5789 return self._error
5790
5791 @error.setter
5792 def error(self, value):
5793 self._error = value
5794
5795 def __iter__(self):
5796 while self._should_retry():
5797 self.error = NO_DEFAULT
5798 self.attempt += 1
5799 yield self
5800 if self.error:
5801 self.error_callback(self.error, self.attempt, self.retries)
5802
5803 @staticmethod
5804 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5805 """Utility function for reporting retries"""
5806 if count > retries:
5807 if error:
5808 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5809 raise e
5810
5811 if not count:
5812 return warn(e)
5813 elif isinstance(e, ExtractorError):
5814 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5815 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5816
5817 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5818 if delay:
5819 info(f'Sleeping {delay:.2f} seconds ...')
5820 time.sleep(delay)
5821
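# A minimal sketch (not part of the original module) wiring report_retry as the
# error callback; print stands in for a real logger and fragile_operation is hypothetical:
#   for retry in RetryManager(3, RetryManager.report_retry,
#                             sleep_func=1, info=print, warn=print):
#       try:
#           fragile_operation()
#       except OSError as err:
#           retry.error = err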
5822
5823 def make_archive_id(ie, video_id):
5824 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5825 return f'{ie_key.lower()} {video_id}'
5826
5827
def truncate_string(s, left, right=0):
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    # NB: s[-right:] would return the whole string when right == 0
    return f'{s[:left - 3]}...{s[len(s) - right:]}'
5833
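# Illustrative example (not part of the original module): the middle of the
# string is replaced so the result is at most left + right characters long:
#   truncate_string('abcdefghij', 5, 2) == 'ab...ij'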
5834
5835 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5836 assert 'all' in alias_dict, '"all" alias is required'
5837 requested = list(start or [])
5838 for val in options:
5839 discard = val.startswith('-')
5840 if discard:
5841 val = val[1:]
5842
5843 if val in alias_dict:
5844 val = alias_dict[val] if not discard else [
5845 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5846 # NB: Do not allow regex in aliases for performance
5847 requested = orderedSet_from_options(val, alias_dict, start=requested)
5848 continue
5849
5850 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5851 else [val] if val in alias_dict['all'] else None)
5852 if current is None:
5853 raise ValueError(val)
5854
5855 if discard:
5856 for item in current:
5857 while item in requested:
5858 requested.remove(item)
5859 else:
5860 requested.extend(current)
5861
5862 return orderedSet(requested)
5863
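# Illustrative example (not part of the original module), using a hypothetical alias_dict:
#   alias_dict = {'all': ['a', 'b', 'c'], 'default': ['a', 'b']}
#   orderedSet_from_options(['default', '-a', 'c'], alias_dict) == ['b', 'c']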
5864
5865 # Deprecated
5866 has_certifi = bool(certifi)
5867 has_websockets = bool(websockets)