yt_dlp/utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import contextlib
   9 import datetime
  10 import email.header
  11 import email.utils
  12 import errno
  13 import gzip
  14 import hashlib
  15 import hmac
  16 import html.entities
  17 import html.parser
  18 import http.client
  19 import http.cookiejar
  20 import importlib.util
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import shlex
  34 import socket
  35 import ssl
  36 import struct
  37 import subprocess
  38 import sys
  39 import tempfile
  40 import time
  41 import traceback
  42 import types
  43 import unicodedata
  44 import urllib.error
  45 import urllib.parse
  46 import urllib.request
  47 import xml.etree.ElementTree
  48 import zlib
  49
  50 from .compat import functools  # isort: split
  51 from .compat import (
  52     compat_etree_fromstring,
  53     compat_expanduser,
  54     compat_HTMLParseError,
  55     compat_os_name,
  56     compat_shlex_quote,
  57 )
  58 from .dependencies import brotli, certifi, websockets, xattr
  59 from .socks import ProxyType, sockssocket
  60
  61
  62 def register_socks_protocols():
  63     # "Register" SOCKS protocols
  64     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  65     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  66     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  67         if scheme not in urllib.parse.uses_netloc:
  68             urllib.parse.uses_netloc.append(scheme)
  69
  70
  71 # This is not clearly defined otherwise
  72 compiled_regex_type = type(re.compile(''))
  73
  74
  75 def random_user_agent():
  76     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  77     _CHROME_VERSIONS = (
  78         '90.0.4430.212',
  79         '90.0.4430.24',
  80         '90.0.4430.70',
  81         '90.0.4430.72',
  82         '90.0.4430.85',
  83         '90.0.4430.93',
  84         '91.0.4472.101',
  85         '91.0.4472.106',
  86         '91.0.4472.114',
  87         '91.0.4472.124',
  88         '91.0.4472.164',
  89         '91.0.4472.19',
  90         '91.0.4472.77',
  91         '92.0.4515.107',
  92         '92.0.4515.115',
  93         '92.0.4515.131',
  94         '92.0.4515.159',
  95         '92.0.4515.43',
  96         '93.0.4556.0',
  97         '93.0.4577.15',
  98         '93.0.4577.63',
  99         '93.0.4577.82',
 100         '94.0.4606.41',
 101         '94.0.4606.54',
 102         '94.0.4606.61',
 103         '94.0.4606.71',
 104         '94.0.4606.81',
 105         '94.0.4606.85',
 106         '95.0.4638.17',
 107         '95.0.4638.50',
 108         '95.0.4638.54',
 109         '95.0.4638.69',
 110         '95.0.4638.74',
 111         '96.0.4664.18',
 112         '96.0.4664.45',
 113         '96.0.4664.55',
 114         '96.0.4664.93',
 115         '97.0.4692.20',
 116     )
 117     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 118
 119
 120 SUPPORTED_ENCODINGS = [
 121     'gzip', 'deflate'
 122 ]
 123 if brotli:
 124     SUPPORTED_ENCODINGS.append('br')
 125
 126 std_headers = {
 127     'User-Agent': random_user_agent(),
 128     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 129     'Accept-Language': 'en-us,en;q=0.5',
 130     'Sec-Fetch-Mode': 'navigate',
 131 }
 132
 133
 134 USER_AGENTS = {
 135     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 136 }
 137
 138
 139 NO_DEFAULT = object()
 140 IDENTITY = lambda x: x
 141
 142 ENGLISH_MONTH_NAMES = [
 143     'January', 'February', 'March', 'April', 'May', 'June',
 144     'July', 'August', 'September', 'October', 'November', 'December']
 145
 146 MONTH_NAMES = {
 147     'en': ENGLISH_MONTH_NAMES,
 148     'fr': [
 149         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 150         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 151 }
 152
 153 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
 154 TIMEZONE_NAMES = {
 155     'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
 156     'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
 157     'EST': -5, 'EDT': -4,  # Eastern
 158     'CST': -6, 'CDT': -5,  # Central
 159     'MST': -7, 'MDT': -6,  # Mountain
 160     'PST': -8, 'PDT': -7   # Pacific
 161 }
 162
 163 # needed for sanitizing filenames in restricted mode
 164 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 165                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 166                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 167
 168 DATE_FORMATS = (
 169     '%d %B %Y',
 170     '%d %b %Y',
 171     '%B %d %Y',
 172     '%B %dst %Y',
 173     '%B %dnd %Y',
 174     '%B %drd %Y',
 175     '%B %dth %Y',
 176     '%b %d %Y',
 177     '%b %dst %Y',
 178     '%b %dnd %Y',
 179     '%b %drd %Y',
 180     '%b %dth %Y',
 181     '%b %dst %Y %I:%M',
 182     '%b %dnd %Y %I:%M',
 183     '%b %drd %Y %I:%M',
 184     '%b %dth %Y %I:%M',
 185     '%Y %m %d',
 186     '%Y-%m-%d',
 187     '%Y.%m.%d.',
 188     '%Y/%m/%d',
 189     '%Y/%m/%d %H:%M',
 190     '%Y/%m/%d %H:%M:%S',
 191     '%Y%m%d%H%M',
 192     '%Y%m%d%H%M%S',
 193     '%Y%m%d',
 194     '%Y-%m-%d %H:%M',
 195     '%Y-%m-%d %H:%M:%S',
 196     '%Y-%m-%d %H:%M:%S.%f',
 197     '%Y-%m-%d %H:%M:%S:%f',
 198     '%d.%m.%Y %H:%M',
 199     '%d.%m.%Y %H.%M',
 200     '%Y-%m-%dT%H:%M:%SZ',
 201     '%Y-%m-%dT%H:%M:%S.%fZ',
 202     '%Y-%m-%dT%H:%M:%S.%f0Z',
 203     '%Y-%m-%dT%H:%M:%S',
 204     '%Y-%m-%dT%H:%M:%S.%f',
 205     '%Y-%m-%dT%H:%M',
 206     '%b %d %Y at %H:%M',
 207     '%b %d %Y at %H:%M:%S',
 208     '%B %d %Y at %H:%M',
 209     '%B %d %Y at %H:%M:%S',
 210     '%H:%M %d-%b-%Y',
 211 )
 212
 213 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 214 DATE_FORMATS_DAY_FIRST.extend([
 215     '%d-%m-%Y',
 216     '%d.%m.%Y',
 217     '%d.%m.%y',
 218     '%d/%m/%Y',
 219     '%d/%m/%y',
 220     '%d/%m/%Y %H:%M:%S',
 221     '%d-%m-%Y %H:%M',
 222 ])
 223
 224 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 225 DATE_FORMATS_MONTH_FIRST.extend([
 226     '%m-%d-%Y',
 227     '%m.%d.%Y',
 228     '%m/%d/%Y',
 229     '%m/%d/%y',
 230     '%m/%d/%Y %H:%M:%S',
 231 ])
 232
 233 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 234 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'
 235
 236 NUMBER_RE = r'\d+(?:\.\d+)?'
 237
 238
 239 @functools.cache
 240 def preferredencoding():
 241     """Get preferred encoding.
 242
 243     Returns the best encoding scheme for the system, based on
 244     locale.getpreferredencoding() and some further tweaks.
 245     """
 246     try:
 247         pref = locale.getpreferredencoding()
 248         'TEST'.encode(pref)
 249     except Exception:
 250         pref = 'UTF-8'
 251
 252     return pref
 253
 254
 255 def write_json_file(obj, fn):
 256     """ Encode obj as JSON and write it to fn, atomically if possible """
 257
 258     tf = tempfile.NamedTemporaryFile(
 259         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 260         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 261
 262     try:
 263         with tf:
 264             json.dump(obj, tf, ensure_ascii=False)
 265         if sys.platform == 'win32':
 266             # Need to remove existing file on Windows, else os.rename raises
 267             # WindowsError or FileExistsError.
 268             with contextlib.suppress(OSError):
 269                 os.unlink(fn)
 270         with contextlib.suppress(OSError):
 271             mask = os.umask(0)
 272             os.umask(mask)
 273             os.chmod(tf.name, 0o666 & ~mask)
 274         os.rename(tf.name, fn)
 275     except Exception:
 276         with contextlib.suppress(OSError):
 277             os.remove(tf.name)
 278         raise
 279
 280
 281 def find_xpath_attr(node, xpath, key, val=None):
 282     """ Find the xpath xpath[@key=val] """
 283     assert re.match(r'^[a-zA-Z_-]+$', key)
 284     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 285     return node.find(expr)
 286
 287 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 288 # the namespace parameter
 289
 290
 291 def xpath_with_ns(path, ns_map):
 292     components = [c.split(':') for c in path.split('/')]
 293     replaced = []
 294     for c in components:
 295         if len(c) == 1:
 296             replaced.append(c[0])
 297         else:
 298             ns, tag = c
 299             replaced.append('{%s}%s' % (ns_map[ns], tag))
 300     return '/'.join(replaced)
 301
 302
 303 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 304     def _find_xpath(xpath):
 305         return node.find(xpath)
 306
 307     if isinstance(xpath, str):
 308         n = _find_xpath(xpath)
 309     else:
 310         for xp in xpath:
 311             n = _find_xpath(xp)
 312             if n is not None:
 313                 break
 314
 315     if n is None:
 316         if default is not NO_DEFAULT:
 317             return default
 318         elif fatal:
 319             name = xpath if name is None else name
 320             raise ExtractorError('Could not find XML element %s' % name)
 321         else:
 322             return None
 323     return n
 324
 325
 326 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 327     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 328     if n is None or n == default:
 329         return n
 330     if n.text is None:
 331         if default is not NO_DEFAULT:
 332             return default
 333         elif fatal:
 334             name = xpath if name is None else name
 335             raise ExtractorError('Could not find XML element\'s text %s' % name)
 336         else:
 337             return None
 338     return n.text
 339
 340
 341 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 342     n = find_xpath_attr(node, xpath, key)
 343     if n is None:
 344         if default is not NO_DEFAULT:
 345             return default
 346         elif fatal:
 347             name = f'{xpath}[@{key}]' if name is None else name
 348             raise ExtractorError('Could not find XML attribute %s' % name)
 349         else:
 350             return None
 351     return n.attrib[key]
 352
 353
 354 def get_element_by_id(id, html, **kwargs):
 355     """Return the content of the tag with the specified ID in the passed HTML document"""
 356     return get_element_by_attribute('id', id, html, **kwargs)
 357
 358
 359 def get_element_html_by_id(id, html, **kwargs):
 360     """Return the html of the tag with the specified ID in the passed HTML document"""
 361     return get_element_html_by_attribute('id', id, html, **kwargs)
 362
 363
 364 def get_element_by_class(class_name, html):
 365     """Return the content of the first tag with the specified class in the passed HTML document"""
 366     retval = get_elements_by_class(class_name, html)
 367     return retval[0] if retval else None
 368
 369
 370 def get_element_html_by_class(class_name, html):
 371     """Return the html of the first tag with the specified class in the passed HTML document"""
 372     retval = get_elements_html_by_class(class_name, html)
 373     return retval[0] if retval else None
 374
 375
 376 def get_element_by_attribute(attribute, value, html, **kwargs):
 377     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 378     return retval[0] if retval else None
 379
 380
 381 def get_element_html_by_attribute(attribute, value, html, **kargs):
 382     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 383     return retval[0] if retval else None
 384
 385
 386 def get_elements_by_class(class_name, html, **kargs):
 387     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 388     return get_elements_by_attribute(
 389         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 390         html, escape_value=False)
 391
 392
 393 def get_elements_html_by_class(class_name, html):
 394     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 395     return get_elements_html_by_attribute(
 396         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 397         html, escape_value=False)
 398
 399
 400 def get_elements_by_attribute(*args, **kwargs):
 401     """Return the content of the tag with the specified attribute in the passed HTML document"""
 402     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 403
 404
 405 def get_elements_html_by_attribute(*args, **kwargs):
 406     """Return the html of the tag with the specified attribute in the passed HTML document"""
 407     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 408
 409
 410 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 411     """
 412     Return the text (content) and the html (whole) of the tag with the specified
 413     attribute in the passed HTML document
 414     """
 415
 416     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 417
 418     value = re.escape(value) if escape_value else value
 419
 420     partial_element_re = rf'''(?x)
 421         <(?P<tag>[a-zA-Z0-9:._-]+)
 422          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 423          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 424         '''
 425
 426     for m in re.finditer(partial_element_re, html):
 427         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 428
 429         yield (
 430             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 431             whole
 432         )
 433
 434
 435 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 436     """
 437     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 438     closing tag for the first opening tag it has encountered, and can be used
 439     as a context manager
 440     """
 441
 442     class HTMLBreakOnClosingTagException(Exception):
 443         pass
 444
 445     def __init__(self):
 446         self.tagstack = collections.deque()
 447         html.parser.HTMLParser.__init__(self)
 448
 449     def __enter__(self):
 450         return self
 451
 452     def __exit__(self, *_):
 453         self.close()
 454
 455     def close(self):
 456         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 457         # so data remains buffered; we no longer have any interest in it, thus
 458         # override this method to discard it
 459         pass
 460
 461     def handle_starttag(self, tag, _):
 462         self.tagstack.append(tag)
 463
 464     def handle_endtag(self, tag):
 465         if not self.tagstack:
 466             raise compat_HTMLParseError('no tags in the stack')
 467         while self.tagstack:
 468             inner_tag = self.tagstack.pop()
 469             if inner_tag == tag:
 470                 break
 471         else:
 472             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 473         if not self.tagstack:
 474             raise self.HTMLBreakOnClosingTagException()
 475
 476
 477 def get_element_text_and_html_by_tag(tag, html):
 478     """
 479     For the first element with the specified tag in the passed HTML document
 480     return its' content (text) and the whole element (html)
 481     """
 482     def find_or_raise(haystack, needle, exc):
 483         try:
 484             return haystack.index(needle)
 485         except ValueError:
 486             raise exc
 487     closing_tag = f'</{tag}>'
 488     whole_start = find_or_raise(
 489         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 490     content_start = find_or_raise(
 491         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 492     content_start += whole_start + 1
 493     with HTMLBreakOnClosingTagParser() as parser:
 494         parser.feed(html[whole_start:content_start])
 495         if not parser.tagstack or parser.tagstack[0] != tag:
 496             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 497         offset = content_start
 498         while offset < len(html):
 499             next_closing_tag_start = find_or_raise(
 500                 html[offset:], closing_tag,
 501                 compat_HTMLParseError(f'closing {tag} tag not found'))
 502             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 503             try:
 504                 parser.feed(html[offset:offset + next_closing_tag_end])
 505                 offset += next_closing_tag_end
 506             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 507                 return html[content_start:offset + next_closing_tag_start], \
 508                     html[whole_start:offset + next_closing_tag_end]
 509         raise compat_HTMLParseError('unexpected end of html')
 510
 511
 512 class HTMLAttributeParser(html.parser.HTMLParser):
 513     """Trivial HTML parser to gather the attributes for a single element"""
 514
 515     def __init__(self):
 516         self.attrs = {}
 517         html.parser.HTMLParser.__init__(self)
 518
 519     def handle_starttag(self, tag, attrs):
 520         self.attrs = dict(attrs)
 521
 522
 523 class HTMLListAttrsParser(html.parser.HTMLParser):
 524     """HTML parser to gather the attributes for the elements of a list"""
 525
 526     def __init__(self):
 527         html.parser.HTMLParser.__init__(self)
 528         self.items = []
 529         self._level = 0
 530
 531     def handle_starttag(self, tag, attrs):
 532         if tag == 'li' and self._level == 0:
 533             self.items.append(dict(attrs))
 534         self._level += 1
 535
 536     def handle_endtag(self, tag):
 537         self._level -= 1
 538
 539
 540 def extract_attributes(html_element):
 541     """Given a string for an HTML element such as
 542     <el
 543          a="foo" B="bar" c="&98;az" d=boz
 544          empty= noval entity="&amp;"
 545          sq='"' dq="'"
 546     >
 547     Decode and return a dictionary of attributes.
 548     {
 549         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 550         'empty': '', 'noval': None, 'entity': '&',
 551         'sq': '"', 'dq': '\''
 552     }.
 553     """
 554     parser = HTMLAttributeParser()
 555     with contextlib.suppress(compat_HTMLParseError):
 556         parser.feed(html_element)
 557         parser.close()
 558     return parser.attrs
 559
 560
 561 def parse_list(webpage):
 562     """Given a string for an series of HTML <li> elements,
 563     return a dictionary of their attributes"""
 564     parser = HTMLListAttrsParser()
 565     parser.feed(webpage)
 566     parser.close()
 567     return parser.items
 568
 569
 570 def clean_html(html):
 571     """Clean an HTML snippet into a readable string"""
 572
 573     if html is None:  # Convenience for sanitizing descriptions etc.
 574         return html
 575
 576     html = re.sub(r'\s+', ' ', html)
 577     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 578     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 579     # Strip html tags
 580     html = re.sub('<.*?>', '', html)
 581     # Replace html entities
 582     html = unescapeHTML(html)
 583     return html.strip()
 584
 585
 586 class LenientJSONDecoder(json.JSONDecoder):
 587     def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
 588         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 589         super().__init__(*args, **kwargs)
 590
 591     def decode(self, s):
 592         if self.transform_source:
 593             s = self.transform_source(s)
 594         if self.ignore_extra:
 595             return self.raw_decode(s.lstrip())[0]
 596         return super().decode(s)
 597
 598
 599 def sanitize_open(filename, open_mode):
 600     """Try to open the given filename, and slightly tweak it if this fails.
 601
 602     Attempts to open the given filename. If this fails, it tries to change
 603     the filename slightly, step by step, until it's either able to open it
 604     or it fails and raises a final exception, like the standard open()
 605     function.
 606
 607     It returns the tuple (stream, definitive_file_name).
 608     """
 609     if filename == '-':
 610         if sys.platform == 'win32':
 611             import msvcrt
 612
 613             # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
 614             with contextlib.suppress(io.UnsupportedOperation):
 615                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 616         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 617
 618     for attempt in range(2):
 619         try:
 620             try:
 621                 if sys.platform == 'win32':
 622                     # FIXME: An exclusive lock also locks the file from being read.
 623                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 624                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 625                     raise LockingUnsupportedError()
 626                 stream = locked_file(filename, open_mode, block=False).__enter__()
 627             except OSError:
 628                 stream = open(filename, open_mode)
 629             return stream, filename
 630         except OSError as err:
 631             if attempt or err.errno in (errno.EACCES,):
 632                 raise
 633             old_filename, filename = filename, sanitize_path(filename)
 634             if old_filename == filename:
 635                 raise
 636
 637
 638 def timeconvert(timestr):
 639     """Convert RFC 2822 defined time string into system timestamp"""
 640     timestamp = None
 641     timetuple = email.utils.parsedate_tz(timestr)
 642     if timetuple is not None:
 643         timestamp = email.utils.mktime_tz(timetuple)
 644     return timestamp
 645
 646
 647 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 648     """Sanitizes a string so it could be used as part of a filename.
 649     @param restricted   Use a stricter subset of allowed characters
 650     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 651                         If unset, yt-dlp's new sanitization rules are in effect
 652     """
 653     if s == '':
 654         return ''
 655
 656     def replace_insane(char):
 657         if restricted and char in ACCENT_CHARS:
 658             return ACCENT_CHARS[char]
 659         elif not restricted and char == '\n':
 660             return '\0 '
 661         elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
 662             # Replace with their full-width unicode counterparts
 663             return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
 664         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 665             return ''
 666         elif char == '"':
 667             return '' if restricted else '\''
 668         elif char == ':':
 669             return '\0_\0-' if restricted else '\0 \0-'
 670         elif char in '\\/|*<>':
 671             return '\0_'
 672         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 673             return '\0_'
 674         return char
 675
 676     if restricted and is_id is NO_DEFAULT:
 677         s = unicodedata.normalize('NFKC', s)
 678     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 679     result = ''.join(map(replace_insane, s))
 680     if is_id is NO_DEFAULT:
 681         result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
 682         STRIP_RE = r'(?:\0.|[ _-])*'
 683         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 684     result = result.replace('\0', '') or '_'
 685
 686     if not is_id:
 687         while '__' in result:
 688             result = result.replace('__', '_')
 689         result = result.strip('_')
 690         # Common case of "Foreign band name - English song title"
 691         if restricted and result.startswith('-_'):
 692             result = result[2:]
 693         if result.startswith('-'):
 694             result = '_' + result[len('-'):]
 695         result = result.lstrip('.')
 696         if not result:
 697             result = '_'
 698     return result
 699
 700
 701 def sanitize_path(s, force=False):
 702     """Sanitizes and normalizes path on Windows"""
 703     if sys.platform == 'win32':
 704         force = False
 705         drive_or_unc, _ = os.path.splitdrive(s)
 706     elif force:
 707         drive_or_unc = ''
 708     else:
 709         return s
 710
 711     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 712     if drive_or_unc:
 713         norm_path.pop(0)
 714     sanitized_path = [
 715         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 716         for path_part in norm_path]
 717     if drive_or_unc:
 718         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 719     elif force and s and s[0] == os.path.sep:
 720         sanitized_path.insert(0, os.path.sep)
 721     return os.path.join(*sanitized_path)
 722
 723
 724 def sanitize_url(url, *, scheme='http'):
 725     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 726     # the number of unwanted failures due to missing protocol
 727     if url is None:
 728         return
 729     elif url.startswith('//'):
 730         return f'{scheme}:{url}'
 731     # Fix some common typos seen so far
 732     COMMON_TYPOS = (
 733         # https://github.com/ytdl-org/youtube-dl/issues/15649
 734         (r'^httpss://', r'https://'),
 735         # https://bx1.be/lives/direct-tv/
 736         (r'^rmtp([es]?)://', r'rtmp\1://'),
 737     )
 738     for mistake, fixup in COMMON_TYPOS:
 739         if re.match(mistake, url):
 740             return re.sub(mistake, fixup, url)
 741     return url
 742
 743
 744 def extract_basic_auth(url):
 745     parts = urllib.parse.urlsplit(url)
 746     if parts.username is None:
 747         return url, None
 748     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 749         parts.hostname if parts.port is None
 750         else '%s:%d' % (parts.hostname, parts.port))))
 751     auth_payload = base64.b64encode(
 752         ('%s:%s' % (parts.username, parts.password or '')).encode())
 753     return url, f'Basic {auth_payload.decode()}'
 754
 755
 756 def sanitized_Request(url, *args, **kwargs):
 757     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 758     if auth_header is not None:
 759         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 760         headers['Authorization'] = auth_header
 761     return urllib.request.Request(url, *args, **kwargs)
 762
 763
 764 def expand_path(s):
 765     """Expand shell variables and ~"""
 766     return os.path.expandvars(compat_expanduser(s))
 767
 768
 769 def orderedSet(iterable, *, lazy=False):
 770     """Remove all duplicates from the input iterable"""
 771     def _iter():
 772         seen = []  # Do not use set since the items can be unhashable
 773         for x in iterable:
 774             if x not in seen:
 775                 seen.append(x)
 776                 yield x
 777
 778     return _iter() if lazy else list(_iter())
 779
 780
 781 def _htmlentity_transform(entity_with_semicolon):
 782     """Transforms an HTML entity to a character."""
 783     entity = entity_with_semicolon[:-1]
 784
 785     # Known non-numeric HTML entity
 786     if entity in html.entities.name2codepoint:
 787         return chr(html.entities.name2codepoint[entity])
 788
 789     # TODO: HTML5 allows entities without a semicolon.
 790     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 791     if entity_with_semicolon in html.entities.html5:
 792         return html.entities.html5[entity_with_semicolon]
 793
 794     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 795     if mobj is not None:
 796         numstr = mobj.group(1)
 797         if numstr.startswith('x'):
 798             base = 16
 799             numstr = '0%s' % numstr
 800         else:
 801             base = 10
 802         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 803         with contextlib.suppress(ValueError):
 804             return chr(int(numstr, base))
 805
 806     # Unknown entity in name, return its literal representation
 807     return '&%s;' % entity
 808
 809
 810 def unescapeHTML(s):
 811     if s is None:
 812         return None
 813     assert isinstance(s, str)
 814
 815     return re.sub(
 816         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 817
 818
 819 def escapeHTML(text):
 820     return (
 821         text
 822         .replace('&', '&amp;')
 823         .replace('<', '&lt;')
 824         .replace('>', '&gt;')
 825         .replace('"', '&quot;')
 826         .replace("'", '&#39;')
 827     )
 828
 829
 830 def process_communicate_or_kill(p, *args, **kwargs):
 831     write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
 832                  'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
 833     return Popen.communicate_or_kill(p, *args, **kwargs)
 834
 835
 836 class Popen(subprocess.Popen):
 837     if sys.platform == 'win32':
 838         _startupinfo = subprocess.STARTUPINFO()
 839         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 840     else:
 841         _startupinfo = None
 842
 843     def __init__(self, *args, text=False, **kwargs):
 844         if text is True:
 845             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 846             kwargs.setdefault('encoding', 'utf-8')
 847             kwargs.setdefault('errors', 'replace')
 848         super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
 849
 850     def communicate_or_kill(self, *args, **kwargs):
 851         try:
 852             return self.communicate(*args, **kwargs)
 853         except BaseException:  # Including KeyboardInterrupt
 854             self.kill(timeout=None)
 855             raise
 856
 857     def kill(self, *, timeout=0):
 858         super().kill()
 859         if timeout != 0:
 860             self.wait(timeout=timeout)
 861
 862     @classmethod
 863     def run(cls, *args, **kwargs):
 864         with cls(*args, **kwargs) as proc:
 865             stdout, stderr = proc.communicate_or_kill()
 866             return stdout or '', stderr or '', proc.returncode
 867
 868
 869 def get_subprocess_encoding():
 870     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 871         # For subprocess calls, encode with locale encoding
 872         # Refer to http://stackoverflow.com/a/9951851/35070
 873         encoding = preferredencoding()
 874     else:
 875         encoding = sys.getfilesystemencoding()
 876     if encoding is None:
 877         encoding = 'utf-8'
 878     return encoding
 879
 880
 881 def encodeFilename(s, for_subprocess=False):
 882     assert isinstance(s, str)
 883     return s
 884
 885
 886 def decodeFilename(b, for_subprocess=False):
 887     return b
 888
 889
 890 def encodeArgument(s):
 891     # Legacy code that uses byte strings
 892     # Uncomment the following line after fixing all post processors
 893     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 894     return s if isinstance(s, str) else s.decode('ascii')
 895
 896
 897 def decodeArgument(b):
 898     return b
 899
 900
 901 def decodeOption(optval):
 902     if optval is None:
 903         return optval
 904     if isinstance(optval, bytes):
 905         optval = optval.decode(preferredencoding())
 906
 907     assert isinstance(optval, str)
 908     return optval
 909
 910
 911 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 912
 913
 914 def timetuple_from_msec(msec):
 915     secs, msec = divmod(msec, 1000)
 916     mins, secs = divmod(secs, 60)
 917     hrs, mins = divmod(mins, 60)
 918     return _timetuple(hrs, mins, secs, msec)
 919
 920
 921 def formatSeconds(secs, delim=':', msec=False):
 922     time = timetuple_from_msec(secs * 1000)
 923     if time.hours:
 924         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 925     elif time.minutes:
 926         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 927     else:
 928         ret = '%d' % time.seconds
 929     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 930
 931
 932 def _ssl_load_windows_store_certs(ssl_context, storename):
 933     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 934     try:
 935         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 936                  if encoding == 'x509_asn' and (
 937                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 938     except PermissionError:
 939         return
 940     for cert in certs:
 941         with contextlib.suppress(ssl.SSLError):
 942             ssl_context.load_verify_locations(cadata=cert)
 943
 944
 945 def make_HTTPS_handler(params, **kwargs):
 946     opts_check_certificate = not params.get('nocheckcertificate')
 947     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 948     context.check_hostname = opts_check_certificate
 949     if params.get('legacyserverconnect'):
 950         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
 951         # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
 952         context.set_ciphers('DEFAULT')
 953
 954     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
 955     if opts_check_certificate:
 956         if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
 957             context.load_verify_locations(cafile=certifi.where())
 958         else:
 959             try:
 960                 context.load_default_certs()
 961                 # Work around the issue in load_default_certs when there are bad certificates. See:
 962                 # https://github.com/yt-dlp/yt-dlp/issues/1060,
 963                 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
 964             except ssl.SSLError:
 965                 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
 966                 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
 967                     for storename in ('CA', 'ROOT'):
 968                         _ssl_load_windows_store_certs(context, storename)
 969                 context.set_default_verify_paths()
 970
 971     client_certfile = params.get('client_certificate')
 972     if client_certfile:
 973         try:
 974             context.load_cert_chain(
 975                 client_certfile, keyfile=params.get('client_certificate_key'),
 976                 password=params.get('client_certificate_password'))
 977         except ssl.SSLError:
 978             raise YoutubeDLError('Unable to load client certificate')
 979
 980     # Some servers may reject requests if ALPN extension is not sent. See:
 981     # https://github.com/python/cpython/issues/85140
 982     # https://github.com/yt-dlp/yt-dlp/issues/3878
 983     with contextlib.suppress(NotImplementedError):
 984         context.set_alpn_protocols(['http/1.1'])
 985
 986     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 987
 988
 989 def bug_reports_message(before=';'):
 990     from .update import REPOSITORY
 991
 992     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 993            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 994
 995     before = before.rstrip()
 996     if not before or before.endswith(('.', '!', '?')):
 997         msg = msg[0].title() + msg[1:]
 998
 999     return (before + ' ' if before else '') + msg
1000
1001
1002 class YoutubeDLError(Exception):
1003     """Base exception for YoutubeDL errors."""
1004     msg = None
1005
1006     def __init__(self, msg=None):
1007         if msg is not None:
1008             self.msg = msg
1009         elif self.msg is None:
1010             self.msg = type(self).__name__
1011         super().__init__(self.msg)
1012
1013
1014 network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
1015 if hasattr(ssl, 'CertificateError'):
1016     network_exceptions.append(ssl.CertificateError)
1017 network_exceptions = tuple(network_exceptions)
1018
1019
1020 class ExtractorError(YoutubeDLError):
1021     """Error during info extraction."""
1022
1023     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1024         """ tb, if given, is the original traceback (so that it can be printed out).
1025         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1026         """
1027         if sys.exc_info()[0] in network_exceptions:
1028             expected = True
1029
1030         self.orig_msg = str(msg)
1031         self.traceback = tb
1032         self.expected = expected
1033         self.cause = cause
1034         self.video_id = video_id
1035         self.ie = ie
1036         self.exc_info = sys.exc_info()  # preserve original exception
1037         if isinstance(self.exc_info[1], ExtractorError):
1038             self.exc_info = self.exc_info[1].exc_info
1039
1040         super().__init__(''.join((
1041             format_field(ie, None, '[%s] '),
1042             format_field(video_id, None, '%s: '),
1043             msg,
1044             format_field(cause, None, ' (caused by %r)'),
1045             '' if expected else bug_reports_message())))
1046
1047     def format_traceback(self):
1048         return join_nonempty(
1049             self.traceback and ''.join(traceback.format_tb(self.traceback)),
1050             self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1051             delim='\n') or None
1052
1053
1054 class UnsupportedError(ExtractorError):
1055     def __init__(self, url):
1056         super().__init__(
1057             'Unsupported URL: %s' % url, expected=True)
1058         self.url = url
1059
1060
1061 class RegexNotFoundError(ExtractorError):
1062     """Error when a regex didn't match"""
1063     pass
1064
1065
1066 class GeoRestrictedError(ExtractorError):
1067     """Geographic restriction Error exception.
1068
1069     This exception may be thrown when a video is not available from your
1070     geographic location due to geographic restrictions imposed by a website.
1071     """
1072
1073     def __init__(self, msg, countries=None, **kwargs):
1074         kwargs['expected'] = True
1075         super().__init__(msg, **kwargs)
1076         self.countries = countries
1077
1078
1079 class UserNotLive(ExtractorError):
1080     """Error when a channel/user is not live"""
1081
1082     def __init__(self, msg=None, **kwargs):
1083         kwargs['expected'] = True
1084         super().__init__(msg or 'The channel is not currently live', **kwargs)
1085
1086
1087 class DownloadError(YoutubeDLError):
1088     """Download Error exception.
1089
1090     This exception may be thrown by FileDownloader objects if they are not
1091     configured to continue on errors. They will contain the appropriate
1092     error message.
1093     """
1094
1095     def __init__(self, msg, exc_info=None):
1096         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1097         super().__init__(msg)
1098         self.exc_info = exc_info
1099
1100
1101 class EntryNotInPlaylist(YoutubeDLError):
1102     """Entry not in playlist exception.
1103
1104     This exception will be thrown by YoutubeDL when a requested entry
1105     is not found in the playlist info_dict
1106     """
1107     msg = 'Entry not found in info'
1108
1109
1110 class SameFileError(YoutubeDLError):
1111     """Same File exception.
1112
1113     This exception will be thrown by FileDownloader objects if they detect
1114     multiple files would have to be downloaded to the same file on disk.
1115     """
1116     msg = 'Fixed output name but more than one file to download'
1117
1118     def __init__(self, filename=None):
1119         if filename is not None:
1120             self.msg += f': {filename}'
1121         super().__init__(self.msg)
1122
1123
1124 class PostProcessingError(YoutubeDLError):
1125     """Post Processing exception.
1126
1127     This exception may be raised by PostProcessor's .run() method to
1128     indicate an error in the postprocessing task.
1129     """
1130
1131
1132 class DownloadCancelled(YoutubeDLError):
1133     """ Exception raised when the download queue should be interrupted """
1134     msg = 'The download was cancelled'
1135
1136
1137 class ExistingVideoReached(DownloadCancelled):
1138     """ --break-on-existing triggered """
1139     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1140
1141
1142 class RejectedVideoReached(DownloadCancelled):
1143     """ --break-on-reject triggered """
1144     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1145
1146
1147 class MaxDownloadsReached(DownloadCancelled):
1148     """ --max-downloads limit has been reached. """
1149     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1150
1151
1152 class ReExtractInfo(YoutubeDLError):
1153     """ Video info needs to be re-extracted. """
1154
1155     def __init__(self, msg, expected=False):
1156         super().__init__(msg)
1157         self.expected = expected
1158
1159
1160 class ThrottledDownload(ReExtractInfo):
1161     """ Download speed below --throttled-rate. """
1162     msg = 'The download speed is below throttle limit'
1163
1164     def __init__(self):
1165         super().__init__(self.msg, expected=False)
1166
1167
1168 class UnavailableVideoError(YoutubeDLError):
1169     """Unavailable Format exception.
1170
1171     This exception will be thrown when a video is requested
1172     in a format that is not available for that video.
1173     """
1174     msg = 'Unable to download video'
1175
1176     def __init__(self, err=None):
1177         if err is not None:
1178             self.msg += f': {err}'
1179         super().__init__(self.msg)
1180
1181
1182 class ContentTooShortError(YoutubeDLError):
1183     """Content Too Short exception.
1184
1185     This exception may be raised by FileDownloader objects when a file they
1186     download is too small for what the server announced first, indicating
1187     the connection was probably interrupted.
1188     """
1189
1190     def __init__(self, downloaded, expected):
1191         super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1192         # Both in bytes
1193         self.downloaded = downloaded
1194         self.expected = expected
1195
1196
1197 class XAttrMetadataError(YoutubeDLError):
1198     def __init__(self, code=None, msg='Unknown error'):
1199         super().__init__(msg)
1200         self.code = code
1201         self.msg = msg
1202
1203         # Parsing code and msg
1204         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1205                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1206             self.reason = 'NO_SPACE'
1207         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1208             self.reason = 'VALUE_TOO_LONG'
1209         else:
1210             self.reason = 'NOT_SUPPORTED'
1211
1212
1213 class XAttrUnavailableError(YoutubeDLError):
1214     pass
1215
1216
1217 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1218     hc = http_class(*args, **kwargs)
1219     source_address = ydl_handler._params.get('source_address')
1220
1221     if source_address is not None:
1222         # This is to workaround _create_connection() from socket where it will try all
1223         # address data from getaddrinfo() including IPv6. This filters the result from
1224         # getaddrinfo() based on the source_address value.
1225         # This is based on the cpython socket.create_connection() function.
1226         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1227         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1228             host, port = address
1229             err = None
1230             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1231             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1232             ip_addrs = [addr for addr in addrs if addr[0] == af]
1233             if addrs and not ip_addrs:
1234                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1235                 raise OSError(
1236                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1237                     % (ip_version, source_address[0]))
1238             for res in ip_addrs:
1239                 af, socktype, proto, canonname, sa = res
1240                 sock = None
1241                 try:
1242                     sock = socket.socket(af, socktype, proto)
1243                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1244                         sock.settimeout(timeout)
1245                     sock.bind(source_address)
1246                     sock.connect(sa)
1247                     err = None  # Explicitly break reference cycle
1248                     return sock
1249                 except OSError as _:
1250                     err = _
1251                     if sock is not None:
1252                         sock.close()
1253             if err is not None:
1254                 raise err
1255             else:
1256                 raise OSError('getaddrinfo returns an empty list')
1257         if hasattr(hc, '_create_connection'):
1258             hc._create_connection = _create_connection
1259         hc.source_address = (source_address, 0)
1260
1261     return hc
1262
1263
1264 def handle_youtubedl_headers(headers):
1265     filtered_headers = headers
1266
1267     if 'Youtubedl-no-compression' in filtered_headers:
1268         filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
1269         del filtered_headers['Youtubedl-no-compression']
1270
1271     return filtered_headers
1272
1273
1274 class YoutubeDLHandler(urllib.request.HTTPHandler):
1275     """Handler for HTTP requests and responses.
1276
1277     This class, when installed with an OpenerDirector, automatically adds
1278     the standard headers to every HTTP request and handles gzipped and
1279     deflated responses from web servers. If compression is to be avoided in
1280     a particular request, the original request in the program code only has
1281     to include the HTTP header "Youtubedl-no-compression", which will be
1282     removed before making the real request.
1283
1284     Part of this code was copied from:
1285
1286     http://techknack.net/python-urllib2-handlers/
1287
1288     Andrew Rowls, the author of that code, agreed to release it to the
1289     public domain.
1290     """
1291
1292     def __init__(self, params, *args, **kwargs):
1293         urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1294         self._params = params
1295
1296     def http_open(self, req):
1297         conn_class = http.client.HTTPConnection
1298
1299         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1300         if socks_proxy:
1301             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1302             del req.headers['Ytdl-socks-proxy']
1303
1304         return self.do_open(functools.partial(
1305             _create_http_connection, self, conn_class, False),
1306             req)
1307
1308     @staticmethod
1309     def deflate(data):
1310         if not data:
1311             return data
1312         try:
1313             return zlib.decompress(data, -zlib.MAX_WBITS)
1314         except zlib.error:
1315             return zlib.decompress(data)
1316
1317     @staticmethod
1318     def brotli(data):
1319         if not data:
1320             return data
1321         return brotli.decompress(data)
1322
1323     def http_request(self, req):
1324         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1325         # always respected by websites, some tend to give out URLs with non percent-encoded
1326         # non-ASCII characters (see telemb.py, ard.py [#3412])
1327         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1328         # To work around aforementioned issue we will replace request's original URL with
1329         # percent-encoded one
1330         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1331         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1332         url = req.get_full_url()
1333         url_escaped = escape_url(url)
1334
1335         # Substitute URL if any change after escaping
1336         if url != url_escaped:
1337             req = update_Request(req, url=url_escaped)
1338
1339         for h, v in self._params.get('http_headers', std_headers).items():
1340             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1341             # The dict keys are capitalized because of this bug by urllib
1342             if h.capitalize() not in req.headers:
1343                 req.add_header(h, v)
1344
1345         if 'Accept-encoding' not in req.headers:
1346             req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1347
1348         req.headers = handle_youtubedl_headers(req.headers)
1349
1350         return super().do_request_(req)
1351
1352     def http_response(self, req, resp):
1353         old_resp = resp
1354         # gzip
1355         if resp.headers.get('Content-encoding', '') == 'gzip':
1356             content = resp.read()
1357             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1358             try:
1359                 uncompressed = io.BytesIO(gz.read())
1360             except OSError as original_ioerror:
1361                 # There may be junk add the end of the file
1362                 # See http://stackoverflow.com/q/4928560/35070 for details
1363                 for i in range(1, 1024):
1364                     try:
1365                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1366                         uncompressed = io.BytesIO(gz.read())
1367                     except OSError:
1368                         continue
1369                     break
1370                 else:
1371                     raise original_ioerror
1372             resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1373             resp.msg = old_resp.msg
1374             del resp.headers['Content-encoding']
1375         # deflate
1376         if resp.headers.get('Content-encoding', '') == 'deflate':
1377             gz = io.BytesIO(self.deflate(resp.read()))
1378             resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1379             resp.msg = old_resp.msg
1380             del resp.headers['Content-encoding']
1381         # brotli
1382         if resp.headers.get('Content-encoding', '') == 'br':
1383             resp = urllib.request.addinfourl(
1384                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1385             resp.msg = old_resp.msg
1386             del resp.headers['Content-encoding']
1387         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1388         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1389         if 300 <= resp.code < 400:
1390             location = resp.headers.get('Location')
1391             if location:
1392                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1393                 location = location.encode('iso-8859-1').decode()
1394                 location_escaped = escape_url(location)
1395                 if location != location_escaped:
1396                     del resp.headers['Location']
1397                     resp.headers['Location'] = location_escaped
1398         return resp
1399
1400     https_request = http_request
1401     https_response = http_response
1402
1403
1404 def make_socks_conn_class(base_class, socks_proxy):
1405     assert issubclass(base_class, (
1406         http.client.HTTPConnection, http.client.HTTPSConnection))
1407
1408     url_components = urllib.parse.urlparse(socks_proxy)
1409     if url_components.scheme.lower() == 'socks5':
1410         socks_type = ProxyType.SOCKS5
1411     elif url_components.scheme.lower() in ('socks', 'socks4'):
1412         socks_type = ProxyType.SOCKS4
1413     elif url_components.scheme.lower() == 'socks4a':
1414         socks_type = ProxyType.SOCKS4A
1415
1416     def unquote_if_non_empty(s):
1417         if not s:
1418             return s
1419         return urllib.parse.unquote_plus(s)
1420
1421     proxy_args = (
1422         socks_type,
1423         url_components.hostname, url_components.port or 1080,
1424         True,  # Remote DNS
1425         unquote_if_non_empty(url_components.username),
1426         unquote_if_non_empty(url_components.password),
1427     )
1428
1429     class SocksConnection(base_class):
1430         def connect(self):
1431             self.sock = sockssocket()
1432             self.sock.setproxy(*proxy_args)
1433             if isinstance(self.timeout, (int, float)):
1434                 self.sock.settimeout(self.timeout)
1435             self.sock.connect((self.host, self.port))
1436
1437             if isinstance(self, http.client.HTTPSConnection):
1438                 if hasattr(self, '_context'):  # Python > 2.6
1439                     self.sock = self._context.wrap_socket(
1440                         self.sock, server_hostname=self.host)
1441                 else:
1442                     self.sock = ssl.wrap_socket(self.sock)
1443
1444     return SocksConnection
1445
1446
1447 class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
1448     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1449         urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1450         self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1451         self._params = params
1452
1453     def https_open(self, req):
1454         kwargs = {}
1455         conn_class = self._https_conn_class
1456
1457         if hasattr(self, '_context'):  # python > 2.6
1458             kwargs['context'] = self._context
1459         if hasattr(self, '_check_hostname'):  # python 3.x
1460             kwargs['check_hostname'] = self._check_hostname
1461
1462         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1463         if socks_proxy:
1464             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1465             del req.headers['Ytdl-socks-proxy']
1466
1467         try:
1468             return self.do_open(
1469                 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1470         except urllib.error.URLError as e:
1471             if (isinstance(e.reason, ssl.SSLError)
1472                     and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1473                 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1474             raise
1475
1476
1477 class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
1478     """
1479     See [1] for cookie file format.
1480
1481     1. https://curl.haxx.se/docs/http-cookies.html
1482     """
1483     _HTTPONLY_PREFIX = '#HttpOnly_'
1484     _ENTRY_LEN = 7
1485     _HEADER = '''# Netscape HTTP Cookie File
1486 # This file is generated by yt-dlp.  Do not edit.
1487
1488 '''
1489     _CookieFileEntry = collections.namedtuple(
1490         'CookieFileEntry',
1491         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1492
1493     def __init__(self, filename=None, *args, **kwargs):
1494         super().__init__(None, *args, **kwargs)
1495         if self.is_path(filename):
1496             filename = os.fspath(filename)
1497         self.filename = filename
1498
1499     @staticmethod
1500     def _true_or_false(cndn):
1501         return 'TRUE' if cndn else 'FALSE'
1502
1503     @staticmethod
1504     def is_path(file):
1505         return isinstance(file, (str, bytes, os.PathLike))
1506
1507     @contextlib.contextmanager
1508     def open(self, file, *, write=False):
1509         if self.is_path(file):
1510             with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1511                 yield f
1512         else:
1513             if write:
1514                 file.truncate(0)
1515             yield file
1516
1517     def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1518         now = time.time()
1519         for cookie in self:
1520             if (not ignore_discard and cookie.discard
1521                     or not ignore_expires and cookie.is_expired(now)):
1522                 continue
1523             name, value = cookie.name, cookie.value
1524             if value is None:
1525                 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1526                 # with no name, whereas http.cookiejar regards it as a
1527                 # cookie with no value.
1528                 name, value = '', name
1529             f.write('%s\n' % '\t'.join((
1530                 cookie.domain,
1531                 self._true_or_false(cookie.domain.startswith('.')),
1532                 cookie.path,
1533                 self._true_or_false(cookie.secure),
1534                 str_or_none(cookie.expires, default=''),
1535                 name, value
1536             )))
1537
1538     def save(self, filename=None, *args, **kwargs):
1539         """
1540         Save cookies to a file.
1541         Code is taken from CPython 3.6
1542         https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1543
1544         if filename is None:
1545             if self.filename is not None:
1546                 filename = self.filename
1547             else:
1548                 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1549
1550         # Store session cookies with `expires` set to 0 instead of an empty string
1551         for cookie in self:
1552             if cookie.expires is None:
1553                 cookie.expires = 0
1554
1555         with self.open(filename, write=True) as f:
1556             f.write(self._HEADER)
1557             self._really_save(f, *args, **kwargs)
1558
1559     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1560         """Load cookies from a file."""
1561         if filename is None:
1562             if self.filename is not None:
1563                 filename = self.filename
1564             else:
1565                 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1566
1567         def prepare_line(line):
1568             if line.startswith(self._HTTPONLY_PREFIX):
1569                 line = line[len(self._HTTPONLY_PREFIX):]
1570             # comments and empty lines are fine
1571             if line.startswith('#') or not line.strip():
1572                 return line
1573             cookie_list = line.split('\t')
1574             if len(cookie_list) != self._ENTRY_LEN:
1575                 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
1576             cookie = self._CookieFileEntry(*cookie_list)
1577             if cookie.expires_at and not cookie.expires_at.isdigit():
1578                 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1579             return line
1580
1581         cf = io.StringIO()
1582         with self.open(filename) as f:
1583             for line in f:
1584                 try:
1585                     cf.write(prepare_line(line))
1586                 except http.cookiejar.LoadError as e:
1587                     if f'{line.strip()} '[0] in '[{"':
1588                         raise http.cookiejar.LoadError(
1589                             'Cookies file must be Netscape formatted, not JSON. See  '
1590                             'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1591                     write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1592                     continue
1593         cf.seek(0)
1594         self._really_load(cf, filename, ignore_discard, ignore_expires)
1595         # Session cookies are denoted by either `expires` field set to
1596         # an empty string or 0. MozillaCookieJar only recognizes the former
1597         # (see [1]). So we need force the latter to be recognized as session
1598         # cookies on our own.
1599         # Session cookies may be important for cookies-based authentication,
1600         # e.g. usually, when user does not check 'Remember me' check box while
1601         # logging in on a site, some important cookies are stored as session
1602         # cookies so that not recognizing them will result in failed login.
1603         # 1. https://bugs.python.org/issue17164
1604         for cookie in self:
1605             # Treat `expires=0` cookies as session cookies
1606             if cookie.expires == 0:
1607                 cookie.expires = None
1608                 cookie.discard = True
1609
1610
1611 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1612     def __init__(self, cookiejar=None):
1613         urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1614
1615     def http_response(self, request, response):
1616         return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1617
1618     https_request = urllib.request.HTTPCookieProcessor.http_request
1619     https_response = http_response
1620
1621
1622 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1623     """YoutubeDL redirect handler
1624
1625     The code is based on HTTPRedirectHandler implementation from CPython [1].
1626
1627     This redirect handler solves two issues:
1628      - ensures redirect URL is always unicode under python 2
1629      - introduces support for experimental HTTP response status code
1630        308 Permanent Redirect [2] used by some sites [3]
1631
1632     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1633     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1634     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1635     """
1636
1637     http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1638
1639     def redirect_request(self, req, fp, code, msg, headers, newurl):
1640         """Return a Request or None in response to a redirect.
1641
1642         This is called by the http_error_30x methods when a
1643         redirection response is received.  If a redirection should
1644         take place, return a new Request to allow http_error_30x to
1645         perform the redirect.  Otherwise, raise HTTPError if no-one
1646         else should try to handle this url.  Return None if you can't
1647         but another Handler might.
1648         """
1649         m = req.get_method()
1650         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1651                  or code in (301, 302, 303) and m == "POST")):
1652             raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1653         # Strictly (according to RFC 2616), 301 or 302 in response to
1654         # a POST MUST NOT cause a redirection without confirmation
1655         # from the user (of urllib.request, in this case).  In practice,
1656         # essentially all clients do redirect in this case, so we do
1657         # the same.
1658
1659         # Be conciliant with URIs containing a space.  This is mainly
1660         # redundant with the more complete encoding done in http_error_302(),
1661         # but it is kept for compatibility with other callers.
1662         newurl = newurl.replace(' ', '%20')
1663
1664         CONTENT_HEADERS = ("content-length", "content-type")
1665         # NB: don't use dict comprehension for python 2.6 compatibility
1666         newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1667
1668         # A 303 must either use GET or HEAD for subsequent request
1669         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1670         if code == 303 and m != 'HEAD':
1671             m = 'GET'
1672         # 301 and 302 redirects are commonly turned into a GET from a POST
1673         # for subsequent requests by browsers, so we'll do the same.
1674         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1675         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1676         if code in (301, 302) and m == 'POST':
1677             m = 'GET'
1678
1679         return urllib.request.Request(
1680             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1681             unverifiable=True, method=m)
1682
1683
1684 def extract_timezone(date_str):
1685     m = re.search(
1686         r'''(?x)
1687             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1688             (?P<tz>Z|                                            # just the UTC Z, or
1689                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1690                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1691                    [ ]?                                          # optional space
1692                 (?P<sign>\+|-)                                   # +/-
1693                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1694             $)
1695         ''', date_str)
1696     if not m:
1697         m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1698         timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1699         if timezone is not None:
1700             date_str = date_str[:-len(m.group('tz'))]
1701         timezone = datetime.timedelta(hours=timezone or 0)
1702     else:
1703         date_str = date_str[:-len(m.group('tz'))]
1704         if not m.group('sign'):
1705             timezone = datetime.timedelta()
1706         else:
1707             sign = 1 if m.group('sign') == '+' else -1
1708             timezone = datetime.timedelta(
1709                 hours=sign * int(m.group('hours')),
1710                 minutes=sign * int(m.group('minutes')))
1711     return timezone, date_str
1712
1713
1714 def parse_iso8601(date_str, delimiter='T', timezone=None):
1715     """ Return a UNIX timestamp from the given date """
1716
1717     if date_str is None:
1718         return None
1719
1720     date_str = re.sub(r'\.[0-9]+', '', date_str)
1721
1722     if timezone is None:
1723         timezone, date_str = extract_timezone(date_str)
1724
1725     with contextlib.suppress(ValueError):
1726         date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1727         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1728         return calendar.timegm(dt.timetuple())
1729
1730
1731 def date_formats(day_first=True):
1732     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1733
1734
1735 def unified_strdate(date_str, day_first=True):
1736     """Return a string with the date in the format YYYYMMDD"""
1737
1738     if date_str is None:
1739         return None
1740     upload_date = None
1741     # Replace commas
1742     date_str = date_str.replace(',', ' ')
1743     # Remove AM/PM + timezone
1744     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1745     _, date_str = extract_timezone(date_str)
1746
1747     for expression in date_formats(day_first):
1748         with contextlib.suppress(ValueError):
1749             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1750     if upload_date is None:
1751         timetuple = email.utils.parsedate_tz(date_str)
1752         if timetuple:
1753             with contextlib.suppress(ValueError):
1754                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1755     if upload_date is not None:
1756         return str(upload_date)
1757
1758
1759 def unified_timestamp(date_str, day_first=True):
1760     if date_str is None:
1761         return None
1762
1763     date_str = re.sub(r'\s+', ' ', re.sub(
1764         r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1765
1766     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1767     timezone, date_str = extract_timezone(date_str)
1768
1769     # Remove AM/PM + timezone
1770     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1771
1772     # Remove unrecognized timezones from ISO 8601 alike timestamps
1773     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1774     if m:
1775         date_str = date_str[:-len(m.group('tz'))]
1776
1777     # Python only supports microseconds, so remove nanoseconds
1778     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1779     if m:
1780         date_str = m.group(1)
1781
1782     for expression in date_formats(day_first):
1783         with contextlib.suppress(ValueError):
1784             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1785             return calendar.timegm(dt.timetuple())
1786
1787     timetuple = email.utils.parsedate_tz(date_str)
1788     if timetuple:
1789         return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1790
1791
1792 def determine_ext(url, default_ext='unknown_video'):
1793     if url is None or '.' not in url:
1794         return default_ext
1795     guess = url.partition('?')[0].rpartition('.')[2]
1796     if re.match(r'^[A-Za-z0-9]+$', guess):
1797         return guess
1798     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1799     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1800         return guess.rstrip('/')
1801     else:
1802         return default_ext
1803
1804
1805 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1806     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1807
1808
1809 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1810     R"""
1811     Return a datetime object from a string.
1812     Supported format:
1813         (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1814
1815     @param format       strftime format of DATE
1816     @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
1817                         auto: round to the unit provided in date_str (if applicable).
1818     """
1819     auto_precision = False
1820     if precision == 'auto':
1821         auto_precision = True
1822         precision = 'microsecond'
1823     today = datetime_round(datetime.datetime.utcnow(), precision)
1824     if date_str in ('now', 'today'):
1825         return today
1826     if date_str == 'yesterday':
1827         return today - datetime.timedelta(days=1)
1828     match = re.match(
1829         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1830         date_str)
1831     if match is not None:
1832         start_time = datetime_from_str(match.group('start'), precision, format)
1833         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1834         unit = match.group('unit')
1835         if unit == 'month' or unit == 'year':
1836             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1837             unit = 'day'
1838         else:
1839             if unit == 'week':
1840                 unit = 'day'
1841                 time *= 7
1842             delta = datetime.timedelta(**{unit + 's': time})
1843             new_date = start_time + delta
1844         if auto_precision:
1845             return datetime_round(new_date, unit)
1846         return new_date
1847
1848     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1849
1850
1851 def date_from_str(date_str, format='%Y%m%d', strict=False):
1852     R"""
1853     Return a date object from a string using datetime_from_str
1854
1855     @param strict  Restrict allowed patterns to "YYYYMMDD" and
1856                    (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1857     """
1858     if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1859         raise ValueError(f'Invalid date format "{date_str}"')
1860     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1861
1862
1863 def datetime_add_months(dt, months):
1864     """Increment/Decrement a datetime object by months."""
1865     month = dt.month + months - 1
1866     year = dt.year + month // 12
1867     month = month % 12 + 1
1868     day = min(dt.day, calendar.monthrange(year, month)[1])
1869     return dt.replace(year, month, day)
1870
1871
1872 def datetime_round(dt, precision='day'):
1873     """
1874     Round a datetime object's time to a specific precision
1875     """
1876     if precision == 'microsecond':
1877         return dt
1878
1879     unit_seconds = {
1880         'day': 86400,
1881         'hour': 3600,
1882         'minute': 60,
1883         'second': 1,
1884     }
1885     roundto = lambda x, n: ((x + n / 2) // n) * n
1886     timestamp = calendar.timegm(dt.timetuple())
1887     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1888
1889
1890 def hyphenate_date(date_str):
1891     """
1892     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1893     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1894     if match is not None:
1895         return '-'.join(match.groups())
1896     else:
1897         return date_str
1898
1899
1900 class DateRange:
1901     """Represents a time interval between two dates"""
1902
1903     def __init__(self, start=None, end=None):
1904         """start and end must be strings in the format accepted by date"""
1905         if start is not None:
1906             self.start = date_from_str(start, strict=True)
1907         else:
1908             self.start = datetime.datetime.min.date()
1909         if end is not None:
1910             self.end = date_from_str(end, strict=True)
1911         else:
1912             self.end = datetime.datetime.max.date()
1913         if self.start > self.end:
1914             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1915
1916     @classmethod
1917     def day(cls, day):
1918         """Returns a range that only contains the given day"""
1919         return cls(day, day)
1920
1921     def __contains__(self, date):
1922         """Check if the date is in the range"""
1923         if not isinstance(date, datetime.date):
1924             date = date_from_str(date)
1925         return self.start <= date <= self.end
1926
1927     def __str__(self):
1928         return f'{self.start.isoformat()} - {self.end.isoformat()}'
1929
1930     def __eq__(self, other):
1931         return (isinstance(other, DateRange)
1932                 and self.start == other.start and self.end == other.end)
1933
1934
1935 def platform_name():
1936     """ Returns the platform name as a str """
1937     write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1938     return platform.platform()
1939
1940
1941 @functools.cache
1942 def system_identifier():
1943     python_implementation = platform.python_implementation()
1944     if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1945         python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1946
1947     return 'Python %s (%s %s) - %s %s' % (
1948         platform.python_version(),
1949         python_implementation,
1950         platform.architecture()[0],
1951         platform.platform(),
1952         format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1953     )
1954
1955
1956 @functools.cache
1957 def get_windows_version():
1958     ''' Get Windows version. returns () if it's not running on Windows '''
1959     if compat_os_name == 'nt':
1960         return version_tuple(platform.win32_ver()[1])
1961     else:
1962         return ()
1963
1964
1965 def write_string(s, out=None, encoding=None):
1966     assert isinstance(s, str)
1967     out = out or sys.stderr
1968
1969     if compat_os_name == 'nt' and supports_terminal_sequences(out):
1970         s = re.sub(r'([\r\n]+)', r' \1', s)
1971
1972     enc, buffer = None, out
1973     if 'b' in getattr(out, 'mode', ''):
1974         enc = encoding or preferredencoding()
1975     elif hasattr(out, 'buffer'):
1976         buffer = out.buffer
1977         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1978
1979     buffer.write(s.encode(enc, 'ignore') if enc else s)
1980     out.flush()
1981
1982
1983 def bytes_to_intlist(bs):
1984     if not bs:
1985         return []
1986     if isinstance(bs[0], int):  # Python 3
1987         return list(bs)
1988     else:
1989         return [ord(c) for c in bs]
1990
1991
1992 def intlist_to_bytes(xs):
1993     if not xs:
1994         return b''
1995     return struct.pack('%dB' % len(xs), *xs)
1996
1997
1998 class LockingUnsupportedError(OSError):
1999     msg = 'File locking is not supported'
2000
2001     def __init__(self):
2002         super().__init__(self.msg)
2003
2004
2005 # Cross-platform file locking
2006 if sys.platform == 'win32':
2007     import ctypes
2008     import ctypes.wintypes
2009     import msvcrt
2010
2011     class OVERLAPPED(ctypes.Structure):
2012         _fields_ = [
2013             ('Internal', ctypes.wintypes.LPVOID),
2014             ('InternalHigh', ctypes.wintypes.LPVOID),
2015             ('Offset', ctypes.wintypes.DWORD),
2016             ('OffsetHigh', ctypes.wintypes.DWORD),
2017             ('hEvent', ctypes.wintypes.HANDLE),
2018         ]
2019
2020     kernel32 = ctypes.windll.kernel32
2021     LockFileEx = kernel32.LockFileEx
2022     LockFileEx.argtypes = [
2023         ctypes.wintypes.HANDLE,     # hFile
2024         ctypes.wintypes.DWORD,      # dwFlags
2025         ctypes.wintypes.DWORD,      # dwReserved
2026         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2027         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2028         ctypes.POINTER(OVERLAPPED)  # Overlapped
2029     ]
2030     LockFileEx.restype = ctypes.wintypes.BOOL
2031     UnlockFileEx = kernel32.UnlockFileEx
2032     UnlockFileEx.argtypes = [
2033         ctypes.wintypes.HANDLE,     # hFile
2034         ctypes.wintypes.DWORD,      # dwReserved
2035         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2036         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2037         ctypes.POINTER(OVERLAPPED)  # Overlapped
2038     ]
2039     UnlockFileEx.restype = ctypes.wintypes.BOOL
2040     whole_low = 0xffffffff
2041     whole_high = 0x7fffffff
2042
2043     def _lock_file(f, exclusive, block):
2044         overlapped = OVERLAPPED()
2045         overlapped.Offset = 0
2046         overlapped.OffsetHigh = 0
2047         overlapped.hEvent = 0
2048         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2049
2050         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2051                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2052                           0, whole_low, whole_high, f._lock_file_overlapped_p):
2053             # NB: No argument form of "ctypes.FormatError" does not work on PyPy
2054             raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2055
2056     def _unlock_file(f):
2057         assert f._lock_file_overlapped_p
2058         handle = msvcrt.get_osfhandle(f.fileno())
2059         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2060             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2061
2062 else:
2063     try:
2064         import fcntl
2065
2066         def _lock_file(f, exclusive, block):
2067             flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2068             if not block:
2069                 flags |= fcntl.LOCK_NB
2070             try:
2071                 fcntl.flock(f, flags)
2072             except BlockingIOError:
2073                 raise
2074             except OSError:  # AOSP does not have flock()
2075                 fcntl.lockf(f, flags)
2076
2077         def _unlock_file(f):
2078             try:
2079                 fcntl.flock(f, fcntl.LOCK_UN)
2080             except OSError:
2081                 fcntl.lockf(f, fcntl.LOCK_UN)
2082
2083     except ImportError:
2084
2085         def _lock_file(f, exclusive, block):
2086             raise LockingUnsupportedError()
2087
2088         def _unlock_file(f):
2089             raise LockingUnsupportedError()
2090
2091
2092 class locked_file:
2093     locked = False
2094
2095     def __init__(self, filename, mode, block=True, encoding=None):
2096         if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2097             raise NotImplementedError(mode)
2098         self.mode, self.block = mode, block
2099
2100         writable = any(f in mode for f in 'wax+')
2101         readable = any(f in mode for f in 'r+')
2102         flags = functools.reduce(operator.ior, (
2103             getattr(os, 'O_CLOEXEC', 0),  # UNIX only
2104             getattr(os, 'O_BINARY', 0),  # Windows only
2105             getattr(os, 'O_NOINHERIT', 0),  # Windows only
2106             os.O_CREAT if writable else 0,  # O_TRUNC only after locking
2107             os.O_APPEND if 'a' in mode else 0,
2108             os.O_EXCL if 'x' in mode else 0,
2109             os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2110         ))
2111
2112         self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2113
2114     def __enter__(self):
2115         exclusive = 'r' not in self.mode
2116         try:
2117             _lock_file(self.f, exclusive, self.block)
2118             self.locked = True
2119         except OSError:
2120             self.f.close()
2121             raise
2122         if 'w' in self.mode:
2123             try:
2124                 self.f.truncate()
2125             except OSError as e:
2126                 if e.errno not in (
2127                     errno.ESPIPE,  # Illegal seek - expected for FIFO
2128                     errno.EINVAL,  # Invalid argument - expected for /dev/null
2129                 ):
2130                     raise
2131         return self
2132
2133     def unlock(self):
2134         if not self.locked:
2135             return
2136         try:
2137             _unlock_file(self.f)
2138         finally:
2139             self.locked = False
2140
2141     def __exit__(self, *_):
2142         try:
2143             self.unlock()
2144         finally:
2145             self.f.close()
2146
2147     open = __enter__
2148     close = __exit__
2149
2150     def __getattr__(self, attr):
2151         return getattr(self.f, attr)
2152
2153     def __iter__(self):
2154         return iter(self.f)
2155
2156
2157 @functools.cache
2158 def get_filesystem_encoding():
2159     encoding = sys.getfilesystemencoding()
2160     return encoding if encoding is not None else 'utf-8'
2161
2162
2163 def shell_quote(args):
2164     quoted_args = []
2165     encoding = get_filesystem_encoding()
2166     for a in args:
2167         if isinstance(a, bytes):
2168             # We may get a filename encoded with 'encodeFilename'
2169             a = a.decode(encoding)
2170         quoted_args.append(compat_shlex_quote(a))
2171     return ' '.join(quoted_args)
2172
2173
2174 def smuggle_url(url, data):
2175     """ Pass additional data in a URL for internal use. """
2176
2177     url, idata = unsmuggle_url(url, {})
2178     data.update(idata)
2179     sdata = urllib.parse.urlencode(
2180         {'__youtubedl_smuggle': json.dumps(data)})
2181     return url + '#' + sdata
2182
2183
2184 def unsmuggle_url(smug_url, default=None):
2185     if '#__youtubedl_smuggle' not in smug_url:
2186         return smug_url, default
2187     url, _, sdata = smug_url.rpartition('#')
2188     jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2189     data = json.loads(jsond)
2190     return url, data
2191
2192
2193 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2194     """ Formats numbers with decimal sufixes like K, M, etc """
2195     num, factor = float_or_none(num), float(factor)
2196     if num is None or num < 0:
2197         return None
2198     POSSIBLE_SUFFIXES = 'kMGTPEZY'
2199     exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2200     suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2201     if factor == 1024:
2202         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2203     converted = num / (factor ** exponent)
2204     return fmt % (converted, suffix)
2205
2206
2207 def format_bytes(bytes):
2208     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2209
2210
2211 def lookup_unit_table(unit_table, s):
2212     units_re = '|'.join(re.escape(u) for u in unit_table)
2213     m = re.match(
2214         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2215     if not m:
2216         return None
2217     num_str = m.group('num').replace(',', '.')
2218     mult = unit_table[m.group('unit')]
2219     return int(float(num_str) * mult)
2220
2221
2222 def parse_filesize(s):
2223     if s is None:
2224         return None
2225
2226     # The lower-case forms are of course incorrect and unofficial,
2227     # but we support those too
2228     _UNIT_TABLE = {
2229         'B': 1,
2230         'b': 1,
2231         'bytes': 1,
2232         'KiB': 1024,
2233         'KB': 1000,
2234         'kB': 1024,
2235         'Kb': 1000,
2236         'kb': 1000,
2237         'kilobytes': 1000,
2238         'kibibytes': 1024,
2239         'MiB': 1024 ** 2,
2240         'MB': 1000 ** 2,
2241         'mB': 1024 ** 2,
2242         'Mb': 1000 ** 2,
2243         'mb': 1000 ** 2,
2244         'megabytes': 1000 ** 2,
2245         'mebibytes': 1024 ** 2,
2246         'GiB': 1024 ** 3,
2247         'GB': 1000 ** 3,
2248         'gB': 1024 ** 3,
2249         'Gb': 1000 ** 3,
2250         'gb': 1000 ** 3,
2251         'gigabytes': 1000 ** 3,
2252         'gibibytes': 1024 ** 3,
2253         'TiB': 1024 ** 4,
2254         'TB': 1000 ** 4,
2255         'tB': 1024 ** 4,
2256         'Tb': 1000 ** 4,
2257         'tb': 1000 ** 4,
2258         'terabytes': 1000 ** 4,
2259         'tebibytes': 1024 ** 4,
2260         'PiB': 1024 ** 5,
2261         'PB': 1000 ** 5,
2262         'pB': 1024 ** 5,
2263         'Pb': 1000 ** 5,
2264         'pb': 1000 ** 5,
2265         'petabytes': 1000 ** 5,
2266         'pebibytes': 1024 ** 5,
2267         'EiB': 1024 ** 6,
2268         'EB': 1000 ** 6,
2269         'eB': 1024 ** 6,
2270         'Eb': 1000 ** 6,
2271         'eb': 1000 ** 6,
2272         'exabytes': 1000 ** 6,
2273         'exbibytes': 1024 ** 6,
2274         'ZiB': 1024 ** 7,
2275         'ZB': 1000 ** 7,
2276         'zB': 1024 ** 7,
2277         'Zb': 1000 ** 7,
2278         'zb': 1000 ** 7,
2279         'zettabytes': 1000 ** 7,
2280         'zebibytes': 1024 ** 7,
2281         'YiB': 1024 ** 8,
2282         'YB': 1000 ** 8,
2283         'yB': 1024 ** 8,
2284         'Yb': 1000 ** 8,
2285         'yb': 1000 ** 8,
2286         'yottabytes': 1000 ** 8,
2287         'yobibytes': 1024 ** 8,
2288     }
2289
2290     return lookup_unit_table(_UNIT_TABLE, s)
2291
2292
2293 def parse_count(s):
2294     if s is None:
2295         return None
2296
2297     s = re.sub(r'^[^\d]+\s', '', s).strip()
2298
2299     if re.match(r'^[\d,.]+$', s):
2300         return str_to_int(s)
2301
2302     _UNIT_TABLE = {
2303         'k': 1000,
2304         'K': 1000,
2305         'm': 1000 ** 2,
2306         'M': 1000 ** 2,
2307         'kk': 1000 ** 2,
2308         'KK': 1000 ** 2,
2309         'b': 1000 ** 3,
2310         'B': 1000 ** 3,
2311     }
2312
2313     ret = lookup_unit_table(_UNIT_TABLE, s)
2314     if ret is not None:
2315         return ret
2316
2317     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2318     if mobj:
2319         return str_to_int(mobj.group(1))
2320
2321
2322 def parse_resolution(s, *, lenient=False):
2323     if s is None:
2324         return {}
2325
2326     if lenient:
2327         mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2328     else:
2329         mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2330     if mobj:
2331         return {
2332             'width': int(mobj.group('w')),
2333             'height': int(mobj.group('h')),
2334         }
2335
2336     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2337     if mobj:
2338         return {'height': int(mobj.group(1))}
2339
2340     mobj = re.search(r'\b([48])[kK]\b', s)
2341     if mobj:
2342         return {'height': int(mobj.group(1)) * 540}
2343
2344     return {}
2345
2346
2347 def parse_bitrate(s):
2348     if not isinstance(s, str):
2349         return
2350     mobj = re.search(r'\b(\d+)\s*kbps', s)
2351     if mobj:
2352         return int(mobj.group(1))
2353
2354
2355 def month_by_name(name, lang='en'):
2356     """ Return the number of a month by (locale-independently) English name """
2357
2358     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2359
2360     try:
2361         return month_names.index(name) + 1
2362     except ValueError:
2363         return None
2364
2365
2366 def month_by_abbreviation(abbrev):
2367     """ Return the number of a month by (locale-independently) English
2368         abbreviations """
2369
2370     try:
2371         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2372     except ValueError:
2373         return None
2374
2375
2376 def fix_xml_ampersands(xml_str):
2377     """Replace all the '&' by '&amp;' in XML"""
2378     return re.sub(
2379         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2380         '&amp;',
2381         xml_str)
2382
2383
2384 def setproctitle(title):
2385     assert isinstance(title, str)
2386
2387     # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2388     try:
2389         import ctypes
2390     except ImportError:
2391         return
2392
2393     try:
2394         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2395     except OSError:
2396         return
2397     except TypeError:
2398         # LoadLibrary in Windows Python 2.7.13 only expects
2399         # a bytestring, but since unicode_literals turns
2400         # every string into a unicode string, it fails.
2401         return
2402     title_bytes = title.encode()
2403     buf = ctypes.create_string_buffer(len(title_bytes))
2404     buf.value = title_bytes
2405     try:
2406         libc.prctl(15, buf, 0, 0, 0)
2407     except AttributeError:
2408         return  # Strange libc, just skip this
2409
2410
2411 def remove_start(s, start):
2412     return s[len(start):] if s is not None and s.startswith(start) else s
2413
2414
2415 def remove_end(s, end):
2416     return s[:-len(end)] if s is not None and s.endswith(end) else s
2417
2418
2419 def remove_quotes(s):
2420     if s is None or len(s) < 2:
2421         return s
2422     for quote in ('"', "'", ):
2423         if s[0] == quote and s[-1] == quote:
2424             return s[1:-1]
2425     return s
2426
2427
2428 def get_domain(url):
2429     """
2430     This implementation is inconsistent, but is kept for compatibility.
2431     Use this only for "webpage_url_domain"
2432     """
2433     return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2434
2435
2436 def url_basename(url):
2437     path = urllib.parse.urlparse(url).path
2438     return path.strip('/').split('/')[-1]
2439
2440
2441 def base_url(url):
2442     return re.match(r'https?://[^?#&]+/', url).group()
2443
2444
2445 def urljoin(base, path):
2446     if isinstance(path, bytes):
2447         path = path.decode()
2448     if not isinstance(path, str) or not path:
2449         return None
2450     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2451         return path
2452     if isinstance(base, bytes):
2453         base = base.decode()
2454     if not isinstance(base, str) or not re.match(
2455             r'^(?:https?:)?//', base):
2456         return None
2457     return urllib.parse.urljoin(base, path)
2458
2459
2460 class HEADRequest(urllib.request.Request):
2461     def get_method(self):
2462         return 'HEAD'
2463
2464
2465 class PUTRequest(urllib.request.Request):
2466     def get_method(self):
2467         return 'PUT'
2468
2469
2470 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2471     if get_attr and v is not None:
2472         v = getattr(v, get_attr, None)
2473     try:
2474         return int(v) * invscale // scale
2475     except (ValueError, TypeError, OverflowError):
2476         return default
2477
2478
2479 def str_or_none(v, default=None):
2480     return default if v is None else str(v)
2481
2482
2483 def str_to_int(int_str):
2484     """ A more relaxed version of int_or_none """
2485     if isinstance(int_str, int):
2486         return int_str
2487     elif isinstance(int_str, str):
2488         int_str = re.sub(r'[,\.\+]', '', int_str)
2489         return int_or_none(int_str)
2490
2491
2492 def float_or_none(v, scale=1, invscale=1, default=None):
2493     if v is None:
2494         return default
2495     try:
2496         return float(v) * invscale / scale
2497     except (ValueError, TypeError):
2498         return default
2499
2500
2501 def bool_or_none(v, default=None):
2502     return v if isinstance(v, bool) else default
2503
2504
2505 def strip_or_none(v, default=None):
2506     return v.strip() if isinstance(v, str) else default
2507
2508
2509 def url_or_none(url):
2510     if not url or not isinstance(url, str):
2511         return None
2512     url = url.strip()
2513     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2514
2515
2516 def request_to_url(req):
2517     if isinstance(req, urllib.request.Request):
2518         return req.get_full_url()
2519     else:
2520         return req
2521
2522
2523 def strftime_or_none(timestamp, date_format, default=None):
2524     datetime_object = None
2525     try:
2526         if isinstance(timestamp, (int, float)):  # unix timestamp
2527             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2528         elif isinstance(timestamp, str):  # assume YYYYMMDD
2529             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2530         return datetime_object.strftime(date_format)
2531     except (ValueError, TypeError, AttributeError):
2532         return default
2533
2534
2535 def parse_duration(s):
2536     if not isinstance(s, str):
2537         return None
2538     s = s.strip()
2539     if not s:
2540         return None
2541
2542     days, hours, mins, secs, ms = [None] * 5
2543     m = re.match(r'''(?x)
2544             (?P<before_secs>
2545                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2546             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2547             (?P<ms>[.:][0-9]+)?Z?$
2548         ''', s)
2549     if m:
2550         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2551     else:
2552         m = re.match(
2553             r'''(?ix)(?:P?
2554                 (?:
2555                     [0-9]+\s*y(?:ears?)?,?\s*
2556                 )?
2557                 (?:
2558                     [0-9]+\s*m(?:onths?)?,?\s*
2559                 )?
2560                 (?:
2561                     [0-9]+\s*w(?:eeks?)?,?\s*
2562                 )?
2563                 (?:
2564                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2565                 )?
2566                 T)?
2567                 (?:
2568                     (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2569                 )?
2570                 (?:
2571                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2572                 )?
2573                 (?:
2574                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2575                 )?Z?$''', s)
2576         if m:
2577             days, hours, mins, secs, ms = m.groups()
2578         else:
2579             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2580             if m:
2581                 hours, mins = m.groups()
2582             else:
2583                 return None
2584
2585     if ms:
2586         ms = ms.replace(':', '.')
2587     return sum(float(part or 0) * mult for part, mult in (
2588         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2589
2590
2591 def prepend_extension(filename, ext, expected_real_ext=None):
2592     name, real_ext = os.path.splitext(filename)
2593     return (
2594         f'{name}.{ext}{real_ext}'
2595         if not expected_real_ext or real_ext[1:] == expected_real_ext
2596         else f'{filename}.{ext}')
2597
2598
2599 def replace_extension(filename, ext, expected_real_ext=None):
2600     name, real_ext = os.path.splitext(filename)
2601     return '{}.{}'.format(
2602         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2603         ext)
2604
2605
2606 def check_executable(exe, args=[]):
2607     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2608     args can be a list of arguments for a short output (like -version) """
2609     try:
2610         Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2611     except OSError:
2612         return False
2613     return exe
2614
2615
2616 def _get_exe_version_output(exe, args, *, to_screen=None):
2617     if to_screen:
2618         to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2619     try:
2620         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2621         # SIGTTOU if yt-dlp is run in the background.
2622         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2623         stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2624                                  stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2625     except OSError:
2626         return False
2627     return stdout
2628
2629
2630 def detect_exe_version(output, version_re=None, unrecognized='present'):
2631     assert isinstance(output, str)
2632     if version_re is None:
2633         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2634     m = re.search(version_re, output)
2635     if m:
2636         return m.group(1)
2637     else:
2638         return unrecognized
2639
2640
2641 def get_exe_version(exe, args=['--version'],
2642                     version_re=None, unrecognized='present'):
2643     """ Returns the version of the specified executable,
2644     or False if the executable is not present """
2645     out = _get_exe_version_output(exe, args)
2646     return detect_exe_version(out, version_re, unrecognized) if out else False
2647
2648
2649 def frange(start=0, stop=None, step=1):
2650     """Float range"""
2651     if stop is None:
2652         start, stop = 0, start
2653     sign = [-1, 1][step > 0] if step else 0
2654     while sign * start < sign * stop:
2655         yield start
2656         start += step
2657
2658
2659 class LazyList(collections.abc.Sequence):
2660     """Lazy immutable list from an iterable
2661     Note that slices of a LazyList are lists and not LazyList"""
2662
2663     class IndexError(IndexError):
2664         pass
2665
2666     def __init__(self, iterable, *, reverse=False, _cache=None):
2667         self._iterable = iter(iterable)
2668         self._cache = [] if _cache is None else _cache
2669         self._reversed = reverse
2670
2671     def __iter__(self):
2672         if self._reversed:
2673             # We need to consume the entire iterable to iterate in reverse
2674             yield from self.exhaust()
2675             return
2676         yield from self._cache
2677         for item in self._iterable:
2678             self._cache.append(item)
2679             yield item
2680
2681     def _exhaust(self):
2682         self._cache.extend(self._iterable)
2683         self._iterable = []  # Discard the emptied iterable to make it pickle-able
2684         return self._cache
2685
2686     def exhaust(self):
2687         """Evaluate the entire iterable"""
2688         return self._exhaust()[::-1 if self._reversed else 1]
2689
2690     @staticmethod
2691     def _reverse_index(x):
2692         return None if x is None else ~x
2693
2694     def __getitem__(self, idx):
2695         if isinstance(idx, slice):
2696             if self._reversed:
2697                 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2698             start, stop, step = idx.start, idx.stop, idx.step or 1
2699         elif isinstance(idx, int):
2700             if self._reversed:
2701                 idx = self._reverse_index(idx)
2702             start, stop, step = idx, idx, 0
2703         else:
2704             raise TypeError('indices must be integers or slices')
2705         if ((start or 0) < 0 or (stop or 0) < 0
2706                 or (start is None and step < 0)
2707                 or (stop is None and step > 0)):
2708             # We need to consume the entire iterable to be able to slice from the end
2709             # Obviously, never use this with infinite iterables
2710             self._exhaust()
2711             try:
2712                 return self._cache[idx]
2713             except IndexError as e:
2714                 raise self.IndexError(e) from e
2715         n = max(start or 0, stop or 0) - len(self._cache) + 1
2716         if n > 0:
2717             self._cache.extend(itertools.islice(self._iterable, n))
2718         try:
2719             return self._cache[idx]
2720         except IndexError as e:
2721             raise self.IndexError(e) from e
2722
2723     def __bool__(self):
2724         try:
2725             self[-1] if self._reversed else self[0]
2726         except self.IndexError:
2727             return False
2728         return True
2729
2730     def __len__(self):
2731         self._exhaust()
2732         return len(self._cache)
2733
2734     def __reversed__(self):
2735         return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2736
2737     def __copy__(self):
2738         return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2739
2740     def __repr__(self):
2741         # repr and str should mimic a list. So we exhaust the iterable
2742         return repr(self.exhaust())
2743
2744     def __str__(self):
2745         return repr(self.exhaust())
2746
2747
2748 class PagedList:
2749
2750     class IndexError(IndexError):
2751         pass
2752
2753     def __len__(self):
2754         # This is only useful for tests
2755         return len(self.getslice())
2756
2757     def __init__(self, pagefunc, pagesize, use_cache=True):
2758         self._pagefunc = pagefunc
2759         self._pagesize = pagesize
2760         self._pagecount = float('inf')
2761         self._use_cache = use_cache
2762         self._cache = {}
2763
2764     def getpage(self, pagenum):
2765         page_results = self._cache.get(pagenum)
2766         if page_results is None:
2767             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2768         if self._use_cache:
2769             self._cache[pagenum] = page_results
2770         return page_results
2771
2772     def getslice(self, start=0, end=None):
2773         return list(self._getslice(start, end))
2774
2775     def _getslice(self, start, end):
2776         raise NotImplementedError('This method must be implemented by subclasses')
2777
2778     def __getitem__(self, idx):
2779         assert self._use_cache, 'Indexing PagedList requires cache'
2780         if not isinstance(idx, int) or idx < 0:
2781             raise TypeError('indices must be non-negative integers')
2782         entries = self.getslice(idx, idx + 1)
2783         if not entries:
2784             raise self.IndexError()
2785         return entries[0]
2786
2787
2788 class OnDemandPagedList(PagedList):
2789     """Download pages until a page with less than maximum results"""
2790
2791     def _getslice(self, start, end):
2792         for pagenum in itertools.count(start // self._pagesize):
2793             firstid = pagenum * self._pagesize
2794             nextfirstid = pagenum * self._pagesize + self._pagesize
2795             if start >= nextfirstid:
2796                 continue
2797
2798             startv = (
2799                 start % self._pagesize
2800                 if firstid <= start < nextfirstid
2801                 else 0)
2802             endv = (
2803                 ((end - 1) % self._pagesize) + 1
2804                 if (end is not None and firstid <= end <= nextfirstid)
2805                 else None)
2806
2807             try:
2808                 page_results = self.getpage(pagenum)
2809             except Exception:
2810                 self._pagecount = pagenum - 1
2811                 raise
2812             if startv != 0 or endv is not None:
2813                 page_results = page_results[startv:endv]
2814             yield from page_results
2815
2816             # A little optimization - if current page is not "full", ie. does
2817             # not contain page_size videos then we can assume that this page
2818             # is the last one - there are no more ids on further pages -
2819             # i.e. no need to query again.
2820             if len(page_results) + startv < self._pagesize:
2821                 break
2822
2823             # If we got the whole page, but the next page is not interesting,
2824             # break out early as well
2825             if end == nextfirstid:
2826                 break
2827
2828
2829 class InAdvancePagedList(PagedList):
2830     """PagedList with total number of pages known in advance"""
2831
2832     def __init__(self, pagefunc, pagecount, pagesize):
2833         PagedList.__init__(self, pagefunc, pagesize, True)
2834         self._pagecount = pagecount
2835
2836     def _getslice(self, start, end):
2837         start_page = start // self._pagesize
2838         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2839         skip_elems = start - start_page * self._pagesize
2840         only_more = None if end is None else end - start
2841         for pagenum in range(start_page, end_page):
2842             page_results = self.getpage(pagenum)
2843             if skip_elems:
2844                 page_results = page_results[skip_elems:]
2845                 skip_elems = None
2846             if only_more is not None:
2847                 if len(page_results) < only_more:
2848                     only_more -= len(page_results)
2849                 else:
2850                     yield from page_results[:only_more]
2851                     break
2852             yield from page_results
2853
2854
2855 class PlaylistEntries:
2856     MissingEntry = object()
2857     is_exhausted = False
2858
2859     def __init__(self, ydl, info_dict):
2860         self.ydl = ydl
2861
2862         # _entries must be assigned now since infodict can change during iteration
2863         entries = info_dict.get('entries')
2864         if entries is None:
2865             raise EntryNotInPlaylist('There are no entries')
2866         elif isinstance(entries, list):
2867             self.is_exhausted = True
2868
2869         requested_entries = info_dict.get('requested_entries')
2870         self.is_incomplete = bool(requested_entries)
2871         if self.is_incomplete:
2872             assert self.is_exhausted
2873             self._entries = [self.MissingEntry] * max(requested_entries)
2874             for i, entry in zip(requested_entries, entries):
2875                 self._entries[i - 1] = entry
2876         elif isinstance(entries, (list, PagedList, LazyList)):
2877             self._entries = entries
2878         else:
2879             self._entries = LazyList(entries)
2880
2881     PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2882         (?P<start>[+-]?\d+)?
2883         (?P<range>[:-]
2884             (?P<end>[+-]?\d+|inf(?:inite)?)?
2885             (?::(?P<step>[+-]?\d+))?
2886         )?''')
2887
2888     @classmethod
2889     def parse_playlist_items(cls, string):
2890         for segment in string.split(','):
2891             if not segment:
2892                 raise ValueError('There is two or more consecutive commas')
2893             mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2894             if not mobj:
2895                 raise ValueError(f'{segment!r} is not a valid specification')
2896             start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2897             if int_or_none(step) == 0:
2898                 raise ValueError(f'Step in {segment!r} cannot be zero')
2899             yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2900
2901     def get_requested_items(self):
2902         playlist_items = self.ydl.params.get('playlist_items')
2903         playlist_start = self.ydl.params.get('playliststart', 1)
2904         playlist_end = self.ydl.params.get('playlistend')
2905         # For backwards compatibility, interpret -1 as whole list
2906         if playlist_end in (-1, None):
2907             playlist_end = ''
2908         if not playlist_items:
2909             playlist_items = f'{playlist_start}:{playlist_end}'
2910         elif playlist_start != 1 or playlist_end:
2911             self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2912
2913         for index in self.parse_playlist_items(playlist_items):
2914             for i, entry in self[index]:
2915                 yield i, entry
2916                 if not entry:
2917                     continue
2918                 try:
2919                     # TODO: Add auto-generated fields
2920                     self.ydl._match_entry(entry, incomplete=True, silent=True)
2921                 except (ExistingVideoReached, RejectedVideoReached):
2922                     return
2923
2924     def get_full_count(self):
2925         if self.is_exhausted and not self.is_incomplete:
2926             return len(self)
2927         elif isinstance(self._entries, InAdvancePagedList):
2928             if self._entries._pagesize == 1:
2929                 return self._entries._pagecount
2930
2931     @functools.cached_property
2932     def _getter(self):
2933         if isinstance(self._entries, list):
2934             def get_entry(i):
2935                 try:
2936                     entry = self._entries[i]
2937                 except IndexError:
2938                     entry = self.MissingEntry
2939                     if not self.is_incomplete:
2940                         raise self.IndexError()
2941                 if entry is self.MissingEntry:
2942                     raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2943                 return entry
2944         else:
2945             def get_entry(i):
2946                 try:
2947                     return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2948                 except (LazyList.IndexError, PagedList.IndexError):
2949                     raise self.IndexError()
2950         return get_entry
2951
2952     def __getitem__(self, idx):
2953         if isinstance(idx, int):
2954             idx = slice(idx, idx)
2955
2956         # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2957         step = 1 if idx.step is None else idx.step
2958         if idx.start is None:
2959             start = 0 if step > 0 else len(self) - 1
2960         else:
2961             start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2962
2963         # NB: Do not call len(self) when idx == [:]
2964         if idx.stop is None:
2965             stop = 0 if step < 0 else float('inf')
2966         else:
2967             stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2968         stop += [-1, 1][step > 0]
2969
2970         for i in frange(start, stop, step):
2971             if i < 0:
2972                 continue
2973             try:
2974                 entry = self._getter(i)
2975             except self.IndexError:
2976                 self.is_exhausted = True
2977                 if step > 0:
2978                     break
2979                 continue
2980             yield i + 1, entry
2981
2982     def __len__(self):
2983         return len(tuple(self[:]))
2984
2985     class IndexError(IndexError):
2986         pass
2987
2988
2989 def uppercase_escape(s):
2990     unicode_escape = codecs.getdecoder('unicode_escape')
2991     return re.sub(
2992         r'\\U[0-9a-fA-F]{8}',
2993         lambda m: unicode_escape(m.group(0))[0],
2994         s)
2995
2996
2997 def lowercase_escape(s):
2998     unicode_escape = codecs.getdecoder('unicode_escape')
2999     return re.sub(
3000         r'\\u[0-9a-fA-F]{4}',
3001         lambda m: unicode_escape(m.group(0))[0],
3002         s)
3003
3004
3005 def escape_rfc3986(s):
3006     """Escape non-ASCII characters as suggested by RFC 3986"""
3007     return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3008
3009
3010 def escape_url(url):
3011     """Escape URL as suggested by RFC 3986"""
3012     url_parsed = urllib.parse.urlparse(url)
3013     return url_parsed._replace(
3014         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3015         path=escape_rfc3986(url_parsed.path),
3016         params=escape_rfc3986(url_parsed.params),
3017         query=escape_rfc3986(url_parsed.query),
3018         fragment=escape_rfc3986(url_parsed.fragment)
3019     ).geturl()
3020
3021
3022 def parse_qs(url):
3023     return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3024
3025
3026 def read_batch_urls(batch_fd):
3027     def fixup(url):
3028         if not isinstance(url, str):
3029             url = url.decode('utf-8', 'replace')
3030         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3031         for bom in BOM_UTF8:
3032             if url.startswith(bom):
3033                 url = url[len(bom):]
3034         url = url.lstrip()
3035         if not url or url.startswith(('#', ';', ']')):
3036             return False
3037         # "#" cannot be stripped out since it is part of the URI
3038         # However, it can be safely stripped out if following a whitespace
3039         return re.split(r'\s#', url, 1)[0].rstrip()
3040
3041     with contextlib.closing(batch_fd) as fd:
3042         return [url for url in map(fixup, fd) if url]
3043
3044
3045 def urlencode_postdata(*args, **kargs):
3046     return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3047
3048
3049 def update_url_query(url, query):
3050     if not query:
3051         return url
3052     parsed_url = urllib.parse.urlparse(url)
3053     qs = urllib.parse.parse_qs(parsed_url.query)
3054     qs.update(query)
3055     return urllib.parse.urlunparse(parsed_url._replace(
3056         query=urllib.parse.urlencode(qs, True)))
3057
3058
3059 def update_Request(req, url=None, data=None, headers=None, query=None):
3060     req_headers = req.headers.copy()
3061     req_headers.update(headers or {})
3062     req_data = data or req.data
3063     req_url = update_url_query(url or req.get_full_url(), query)
3064     req_get_method = req.get_method()
3065     if req_get_method == 'HEAD':
3066         req_type = HEADRequest
3067     elif req_get_method == 'PUT':
3068         req_type = PUTRequest
3069     else:
3070         req_type = urllib.request.Request
3071     new_req = req_type(
3072         req_url, data=req_data, headers=req_headers,
3073         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3074     if hasattr(req, 'timeout'):
3075         new_req.timeout = req.timeout
3076     return new_req
3077
3078
3079 def _multipart_encode_impl(data, boundary):
3080     content_type = 'multipart/form-data; boundary=%s' % boundary
3081
3082     out = b''
3083     for k, v in data.items():
3084         out += b'--' + boundary.encode('ascii') + b'\r\n'
3085         if isinstance(k, str):
3086             k = k.encode()
3087         if isinstance(v, str):
3088             v = v.encode()
3089         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3090         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3091         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3092         if boundary.encode('ascii') in content:
3093             raise ValueError('Boundary overlaps with data')
3094         out += content
3095
3096     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3097
3098     return out, content_type
3099
3100
3101 def multipart_encode(data, boundary=None):
3102     '''
3103     Encode a dict to RFC 7578-compliant form-data
3104
3105     data:
3106         A dict where keys and values can be either Unicode or bytes-like
3107         objects.
3108     boundary:
3109         If specified a Unicode object, it's used as the boundary. Otherwise
3110         a random boundary is generated.
3111
3112     Reference: https://tools.ietf.org/html/rfc7578
3113     '''
3114     has_specified_boundary = boundary is not None
3115
3116     while True:
3117         if boundary is None:
3118             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3119
3120         try:
3121             out, content_type = _multipart_encode_impl(data, boundary)
3122             break
3123         except ValueError:
3124             if has_specified_boundary:
3125                 raise
3126             boundary = None
3127
3128     return out, content_type
3129
3130
3131 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3132     for val in map(d.get, variadic(key_or_keys)):
3133         if val is not None and (val or not skip_false_values):
3134             return val
3135     return default
3136
3137
3138 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3139     for f in funcs:
3140         try:
3141             val = f(*args, **kwargs)
3142         except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3143             pass
3144         else:
3145             if expected_type is None or isinstance(val, expected_type):
3146                 return val
3147
3148
3149 def try_get(src, getter, expected_type=None):
3150     return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3151
3152
3153 def filter_dict(dct, cndn=lambda _, v: v is not None):
3154     return {k: v for k, v in dct.items() if cndn(k, v)}
3155
3156
3157 def merge_dicts(*dicts):
3158     merged = {}
3159     for a_dict in dicts:
3160         for k, v in a_dict.items():
3161             if (v is not None and k not in merged
3162                     or isinstance(v, str) and merged[k] == ''):
3163                 merged[k] = v
3164     return merged
3165
3166
3167 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3168     return string if isinstance(string, str) else str(string, encoding, errors)
3169
3170
3171 US_RATINGS = {
3172     'G': 0,
3173     'PG': 10,
3174     'PG-13': 13,
3175     'R': 16,
3176     'NC': 18,
3177 }
3178
3179
3180 TV_PARENTAL_GUIDELINES = {
3181     'TV-Y': 0,
3182     'TV-Y7': 7,
3183     'TV-G': 0,
3184     'TV-PG': 0,
3185     'TV-14': 14,
3186     'TV-MA': 17,
3187 }
3188
3189
3190 def parse_age_limit(s):
3191     # isinstance(False, int) is True. So type() must be used instead
3192     if type(s) is int:  # noqa: E721
3193         return s if 0 <= s <= 21 else None
3194     elif not isinstance(s, str):
3195         return None
3196     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3197     if m:
3198         return int(m.group('age'))
3199     s = s.upper()
3200     if s in US_RATINGS:
3201         return US_RATINGS[s]
3202     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3203     if m:
3204         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3205     return None
3206
3207
3208 def strip_jsonp(code):
3209     return re.sub(
3210         r'''(?sx)^
3211             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3212             (?:\s*&&\s*(?P=func_name))?
3213             \s*\(\s*(?P<callback_data>.*)\);?
3214             \s*?(?://[^\n]*)*$''',
3215         r'\g<callback_data>', code)
3216
3217
3218 def js_to_json(code, vars={}, *, strict=False):
3219     # vars is a dict of var, val pairs to substitute
3220     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3221     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3222     INTEGER_TABLE = (
3223         (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3224         (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3225     )
3226
3227     def fix_kv(m):
3228         v = m.group(0)
3229         if v in ('true', 'false', 'null'):
3230             return v
3231         elif v in ('undefined', 'void 0'):
3232             return 'null'
3233         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3234             return ""
3235
3236         if v[0] in ("'", '"'):
3237             v = re.sub(r'(?s)\\.|"', lambda m: {
3238                 '"': '\\"',
3239                 "\\'": "'",
3240                 '\\\n': '',
3241                 '\\x': '\\u00',
3242             }.get(m.group(0), m.group(0)), v[1:-1])
3243         else:
3244             for regex, base in INTEGER_TABLE:
3245                 im = re.match(regex, v)
3246                 if im:
3247                     i = int(im.group(1), base)
3248                     return '"%d":' % i if v.endswith(':') else '%d' % i
3249
3250             if v in vars:
3251                 return vars[v]
3252             if strict:
3253                 raise ValueError(f'Unknown value: {v}')
3254
3255         return '"%s"' % v
3256
3257     def create_map(mobj):
3258         return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3259
3260     code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3261     if not strict:
3262         code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3263
3264     return re.sub(r'''(?sx)
3265         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3266         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3267         {comment}|,(?={skip}[\]}}])|
3268         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3269         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3270         [0-9]+(?={skip}:)|
3271         !+
3272         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3273
3274
3275 def qualities(quality_ids):
3276     """ Get a numeric quality value out of a list of possible values """
3277     def q(qid):
3278         try:
3279             return quality_ids.index(qid)
3280         except ValueError:
3281             return -1
3282     return q
3283
3284
3285 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3286
3287
3288 DEFAULT_OUTTMPL = {
3289     'default': '%(title)s [%(id)s].%(ext)s',
3290     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3291 }
3292 OUTTMPL_TYPES = {
3293     'chapter': None,
3294     'subtitle': None,
3295     'thumbnail': None,
3296     'description': 'description',
3297     'annotation': 'annotations.xml',
3298     'infojson': 'info.json',
3299     'link': None,
3300     'pl_video': None,
3301     'pl_thumbnail': None,
3302     'pl_description': 'description',
3303     'pl_infojson': 'info.json',
3304 }
3305
3306 # As of [1] format syntax is:
3307 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3308 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3309 STR_FORMAT_RE_TMPL = r'''(?x)
3310     (?<!%)(?P<prefix>(?:%%)*)
3311     %
3312     (?P<has_key>\((?P<key>{0})\))?
3313     (?P<format>
3314         (?P<conversion>[#0\-+ ]+)?
3315         (?P<min_width>\d+)?
3316         (?P<precision>\.\d+)?
3317         (?P<len_mod>[hlL])?  # unused in python
3318         {1}  # conversion type
3319     )
3320 '''
3321
3322
3323 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3324
3325
3326 def limit_length(s, length):
3327     """ Add ellipses to overly long strings """
3328     if s is None:
3329         return None
3330     ELLIPSES = '...'
3331     if len(s) > length:
3332         return s[:length - len(ELLIPSES)] + ELLIPSES
3333     return s
3334
3335
3336 def version_tuple(v):
3337     return tuple(int(e) for e in re.split(r'[-.]', v))
3338
3339
3340 def is_outdated_version(version, limit, assume_new=True):
3341     if not version:
3342         return not assume_new
3343     try:
3344         return version_tuple(version) < version_tuple(limit)
3345     except ValueError:
3346         return not assume_new
3347
3348
3349 def ytdl_is_updateable():
3350     """ Returns if yt-dlp can be updated with -U """
3351
3352     from .update import is_non_updateable
3353
3354     return not is_non_updateable()
3355
3356
3357 def args_to_str(args):
3358     # Get a short string representation for a subprocess command
3359     return ' '.join(compat_shlex_quote(a) for a in args)
3360
3361
3362 def error_to_compat_str(err):
3363     return str(err)
3364
3365
3366 def error_to_str(err):
3367     return f'{type(err).__name__}: {err}'
3368
3369
3370 def mimetype2ext(mt):
3371     if mt is None:
3372         return None
3373
3374     mt, _, params = mt.partition(';')
3375     mt = mt.strip()
3376
3377     FULL_MAP = {
3378         'audio/mp4': 'm4a',
3379         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3380         # it's the most popular one
3381         'audio/mpeg': 'mp3',
3382         'audio/x-wav': 'wav',
3383         'audio/wav': 'wav',
3384         'audio/wave': 'wav',
3385     }
3386
3387     ext = FULL_MAP.get(mt)
3388     if ext is not None:
3389         return ext
3390
3391     SUBTYPE_MAP = {
3392         '3gpp': '3gp',
3393         'smptett+xml': 'tt',
3394         'ttaf+xml': 'dfxp',
3395         'ttml+xml': 'ttml',
3396         'x-flv': 'flv',
3397         'x-mp4-fragmented': 'mp4',
3398         'x-ms-sami': 'sami',
3399         'x-ms-wmv': 'wmv',
3400         'mpegurl': 'm3u8',
3401         'x-mpegurl': 'm3u8',
3402         'vnd.apple.mpegurl': 'm3u8',
3403         'dash+xml': 'mpd',
3404         'f4m+xml': 'f4m',
3405         'hds+xml': 'f4m',
3406         'vnd.ms-sstr+xml': 'ism',
3407         'quicktime': 'mov',
3408         'mp2t': 'ts',
3409         'x-wav': 'wav',
3410         'filmstrip+json': 'fs',
3411         'svg+xml': 'svg',
3412     }
3413
3414     _, _, subtype = mt.rpartition('/')
3415     ext = SUBTYPE_MAP.get(subtype.lower())
3416     if ext is not None:
3417         return ext
3418
3419     SUFFIX_MAP = {
3420         'json': 'json',
3421         'xml': 'xml',
3422         'zip': 'zip',
3423         'gzip': 'gz',
3424     }
3425
3426     _, _, suffix = subtype.partition('+')
3427     ext = SUFFIX_MAP.get(suffix)
3428     if ext is not None:
3429         return ext
3430
3431     return subtype.replace('+', '.')
3432
3433
3434 def ext2mimetype(ext_or_url):
3435     if not ext_or_url:
3436         return None
3437     if '.' not in ext_or_url:
3438         ext_or_url = f'file.{ext_or_url}'
3439     return mimetypes.guess_type(ext_or_url)[0]
3440
3441
3442 def parse_codecs(codecs_str):
3443     # http://tools.ietf.org/html/rfc6381
3444     if not codecs_str:
3445         return {}
3446     split_codecs = list(filter(None, map(
3447         str.strip, codecs_str.strip().strip(',').split(','))))
3448     vcodec, acodec, scodec, hdr = None, None, None, None
3449     for full_codec in split_codecs:
3450         parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3451         if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3452                         'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3453             if vcodec:
3454                 continue
3455             vcodec = full_codec
3456             if parts[0] in ('dvh1', 'dvhe'):
3457                 hdr = 'DV'
3458             elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3459                 hdr = 'HDR10'
3460             elif parts[:2] == ['vp9', '2']:
3461                 hdr = 'HDR10'
3462         elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3463                           'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3464             acodec = acodec or full_codec
3465         elif parts[0] in ('stpp', 'wvtt'):
3466             scodec = scodec or full_codec
3467         else:
3468             write_string(f'WARNING: Unknown codec {full_codec}\n')
3469     if vcodec or acodec or scodec:
3470         return {
3471             'vcodec': vcodec or 'none',
3472             'acodec': acodec or 'none',
3473             'dynamic_range': hdr,
3474             **({'scodec': scodec} if scodec is not None else {}),
3475         }
3476     elif len(split_codecs) == 2:
3477         return {
3478             'vcodec': split_codecs[0],
3479             'acodec': split_codecs[1],
3480         }
3481     return {}
3482
3483
3484 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3485     assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3486
3487     allow_mkv = not preferences or 'mkv' in preferences
3488
3489     if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3490         return 'mkv'  # TODO: any other format allows this?
3491
3492     # TODO: All codecs supported by parse_codecs isn't handled here
3493     COMPATIBLE_CODECS = {
3494         'mp4': {
3495             'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
3496             'h264', 'aacl',  # Set in ISM
3497         },
3498         'webm': {
3499             'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3500             'vp9x', 'vp8x',  # in the webm spec
3501         },
3502     }
3503
3504     sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3505     vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3506
3507     for ext in preferences or COMPATIBLE_CODECS.keys():
3508         codec_set = COMPATIBLE_CODECS.get(ext, set())
3509         if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3510             return ext
3511
3512     COMPATIBLE_EXTS = (
3513         {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3514         {'webm'},
3515     )
3516     for ext in preferences or vexts:
3517         current_exts = {ext, *vexts, *aexts}
3518         if ext == 'mkv' or current_exts == {ext} or any(
3519                 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3520             return ext
3521     return 'mkv' if allow_mkv else preferences[-1]
3522
3523
3524 def urlhandle_detect_ext(url_handle):
3525     getheader = url_handle.headers.get
3526
3527     cd = getheader('Content-Disposition')
3528     if cd:
3529         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3530         if m:
3531             e = determine_ext(m.group('filename'), default_ext=None)
3532             if e:
3533                 return e
3534
3535     return mimetype2ext(getheader('Content-Type'))
3536
3537
3538 def encode_data_uri(data, mime_type):
3539     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3540
3541
3542 def age_restricted(content_limit, age_limit):
3543     """ Returns True iff the content should be blocked """
3544
3545     if age_limit is None:  # No limit set
3546         return False
3547     if content_limit is None:
3548         return False  # Content available for everyone
3549     return age_limit < content_limit
3550
3551
3552 # List of known byte-order-marks (BOM)
3553 BOMS = [
3554     (b'\xef\xbb\xbf', 'utf-8'),
3555     (b'\x00\x00\xfe\xff', 'utf-32-be'),
3556     (b'\xff\xfe\x00\x00', 'utf-32-le'),
3557     (b'\xff\xfe', 'utf-16-le'),
3558     (b'\xfe\xff', 'utf-16-be'),
3559 ]
3560
3561
3562 def is_html(first_bytes):
3563     """ Detect whether a file contains HTML by examining its first bytes. """
3564
3565     encoding = 'utf-8'
3566     for bom, enc in BOMS:
3567         while first_bytes.startswith(bom):
3568             encoding, first_bytes = enc, first_bytes[len(bom):]
3569
3570     return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3571
3572
3573 def determine_protocol(info_dict):
3574     protocol = info_dict.get('protocol')
3575     if protocol is not None:
3576         return protocol
3577
3578     url = sanitize_url(info_dict['url'])
3579     if url.startswith('rtmp'):
3580         return 'rtmp'
3581     elif url.startswith('mms'):
3582         return 'mms'
3583     elif url.startswith('rtsp'):
3584         return 'rtsp'
3585
3586     ext = determine_ext(url)
3587     if ext == 'm3u8':
3588         return 'm3u8'
3589     elif ext == 'f4m':
3590         return 'f4m'
3591
3592     return urllib.parse.urlparse(url).scheme
3593
3594
3595 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3596     """ Render a list of rows, each as a list of values.
3597     Text after a \t will be right aligned """
3598     def width(string):
3599         return len(remove_terminal_sequences(string).replace('\t', ''))
3600
3601     def get_max_lens(table):
3602         return [max(width(str(v)) for v in col) for col in zip(*table)]
3603
3604     def filter_using_list(row, filterArray):
3605         return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3606
3607     max_lens = get_max_lens(data) if hide_empty else []
3608     header_row = filter_using_list(header_row, max_lens)
3609     data = [filter_using_list(row, max_lens) for row in data]
3610
3611     table = [header_row] + data
3612     max_lens = get_max_lens(table)
3613     extra_gap += 1
3614     if delim:
3615         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3616         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3617     for row in table:
3618         for pos, text in enumerate(map(str, row)):
3619             if '\t' in text:
3620                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3621             else:
3622                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3623     ret = '\n'.join(''.join(row).rstrip() for row in table)
3624     return ret
3625
3626
3627 def _match_one(filter_part, dct, incomplete):
3628     # TODO: Generalize code with YoutubeDL._build_format_filter
3629     STRING_OPERATORS = {
3630         '*=': operator.contains,
3631         '^=': lambda attr, value: attr.startswith(value),
3632         '$=': lambda attr, value: attr.endswith(value),
3633         '~=': lambda attr, value: re.search(value, attr),
3634     }
3635     COMPARISON_OPERATORS = {
3636         **STRING_OPERATORS,
3637         '<=': operator.le,  # "<=" must be defined above "<"
3638         '<': operator.lt,
3639         '>=': operator.ge,
3640         '>': operator.gt,
3641         '=': operator.eq,
3642     }
3643
3644     if isinstance(incomplete, bool):
3645         is_incomplete = lambda _: incomplete
3646     else:
3647         is_incomplete = lambda k: k in incomplete
3648
3649     operator_rex = re.compile(r'''(?x)
3650         (?P<key>[a-z_]+)
3651         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3652         (?:
3653             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3654             (?P<strval>.+?)
3655         )
3656         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3657     m = operator_rex.fullmatch(filter_part.strip())
3658     if m:
3659         m = m.groupdict()
3660         unnegated_op = COMPARISON_OPERATORS[m['op']]
3661         if m['negation']:
3662             op = lambda attr, value: not unnegated_op(attr, value)
3663         else:
3664             op = unnegated_op
3665         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3666         if m['quote']:
3667             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3668         actual_value = dct.get(m['key'])
3669         numeric_comparison = None
3670         if isinstance(actual_value, (int, float)):
3671             # If the original field is a string and matching comparisonvalue is
3672             # a number we should respect the origin of the original field
3673             # and process comparison value as a string (see
3674             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3675             try:
3676                 numeric_comparison = int(comparison_value)
3677             except ValueError:
3678                 numeric_comparison = parse_filesize(comparison_value)
3679                 if numeric_comparison is None:
3680                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3681                 if numeric_comparison is None:
3682                     numeric_comparison = parse_duration(comparison_value)
3683         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3684             raise ValueError('Operator %s only supports string values!' % m['op'])
3685         if actual_value is None:
3686             return is_incomplete(m['key']) or m['none_inclusive']
3687         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3688
3689     UNARY_OPERATORS = {
3690         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3691         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3692     }
3693     operator_rex = re.compile(r'''(?x)
3694         (?P<op>%s)\s*(?P<key>[a-z_]+)
3695         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3696     m = operator_rex.fullmatch(filter_part.strip())
3697     if m:
3698         op = UNARY_OPERATORS[m.group('op')]
3699         actual_value = dct.get(m.group('key'))
3700         if is_incomplete(m.group('key')) and actual_value is None:
3701             return True
3702         return op(actual_value)
3703
3704     raise ValueError('Invalid filter part %r' % filter_part)
3705
3706
3707 def match_str(filter_str, dct, incomplete=False):
3708     """ Filter a dictionary with a simple string syntax.
3709     @returns           Whether the filter passes
3710     @param incomplete  Set of keys that is expected to be missing from dct.
3711                        Can be True/False to indicate all/none of the keys may be missing.
3712                        All conditions on incomplete keys pass if the key is missing
3713     """
3714     return all(
3715         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3716         for filter_part in re.split(r'(?<!\\)&', filter_str))
3717
3718
3719 def match_filter_func(filters):
3720     if not filters:
3721         return None
3722     filters = set(variadic(filters))
3723
3724     interactive = '-' in filters
3725     if interactive:
3726         filters.remove('-')
3727
3728     def _match_func(info_dict, incomplete=False):
3729         if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3730             return NO_DEFAULT if interactive and not incomplete else None
3731         else:
3732             video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3733             filter_str = ') | ('.join(map(str.strip, filters))
3734             return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3735     return _match_func
3736
3737
3738 class download_range_func:
3739     def __init__(self, chapters, ranges):
3740         self.chapters, self.ranges = chapters, ranges
3741
3742     def __call__(self, info_dict, ydl):
3743         warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3744                    else 'Cannot match chapters since chapter information is unavailable')
3745         for regex in self.chapters or []:
3746             for i, chapter in enumerate(info_dict.get('chapters') or []):
3747                 if re.search(regex, chapter['title']):
3748                     warning = None
3749                     yield {**chapter, 'index': i}
3750         if self.chapters and warning:
3751             ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3752
3753         yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3754
3755     def __eq__(self, other):
3756         return (isinstance(other, download_range_func)
3757                 and self.chapters == other.chapters and self.ranges == other.ranges)
3758
3759
3760 def parse_dfxp_time_expr(time_expr):
3761     if not time_expr:
3762         return
3763
3764     mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3765     if mobj:
3766         return float(mobj.group('time_offset'))
3767
3768     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3769     if mobj:
3770         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3771
3772
3773 def srt_subtitles_timecode(seconds):
3774     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3775
3776
3777 def ass_subtitles_timecode(seconds):
3778     time = timetuple_from_msec(seconds * 1000)
3779     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3780
3781
3782 def dfxp2srt(dfxp_data):
3783     '''
3784     @param dfxp_data A bytes-like object containing DFXP data
3785     @returns A unicode object containing converted SRT data
3786     '''
3787     LEGACY_NAMESPACES = (
3788         (b'http://www.w3.org/ns/ttml', [
3789             b'http://www.w3.org/2004/11/ttaf1',
3790             b'http://www.w3.org/2006/04/ttaf1',
3791             b'http://www.w3.org/2006/10/ttaf1',
3792         ]),
3793         (b'http://www.w3.org/ns/ttml#styling', [
3794             b'http://www.w3.org/ns/ttml#style',
3795         ]),
3796     )
3797
3798     SUPPORTED_STYLING = [
3799         'color',
3800         'fontFamily',
3801         'fontSize',
3802         'fontStyle',
3803         'fontWeight',
3804         'textDecoration'
3805     ]
3806
3807     _x = functools.partial(xpath_with_ns, ns_map={
3808         'xml': 'http://www.w3.org/XML/1998/namespace',
3809         'ttml': 'http://www.w3.org/ns/ttml',
3810         'tts': 'http://www.w3.org/ns/ttml#styling',
3811     })
3812
3813     styles = {}
3814     default_style = {}
3815
3816     class TTMLPElementParser:
3817         _out = ''
3818         _unclosed_elements = []
3819         _applied_styles = []
3820
3821         def start(self, tag, attrib):
3822             if tag in (_x('ttml:br'), 'br'):
3823                 self._out += '\n'
3824             else:
3825                 unclosed_elements = []
3826                 style = {}
3827                 element_style_id = attrib.get('style')
3828                 if default_style:
3829                     style.update(default_style)
3830                 if element_style_id:
3831                     style.update(styles.get(element_style_id, {}))
3832                 for prop in SUPPORTED_STYLING:
3833                     prop_val = attrib.get(_x('tts:' + prop))
3834                     if prop_val:
3835                         style[prop] = prop_val
3836                 if style:
3837                     font = ''
3838                     for k, v in sorted(style.items()):
3839                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3840                             continue
3841                         if k == 'color':
3842                             font += ' color="%s"' % v
3843                         elif k == 'fontSize':
3844                             font += ' size="%s"' % v
3845                         elif k == 'fontFamily':
3846                             font += ' face="%s"' % v
3847                         elif k == 'fontWeight' and v == 'bold':
3848                             self._out += '<b>'
3849                             unclosed_elements.append('b')
3850                         elif k == 'fontStyle' and v == 'italic':
3851                             self._out += '<i>'
3852                             unclosed_elements.append('i')
3853                         elif k == 'textDecoration' and v == 'underline':
3854                             self._out += '<u>'
3855                             unclosed_elements.append('u')
3856                     if font:
3857                         self._out += '<font' + font + '>'
3858                         unclosed_elements.append('font')
3859                     applied_style = {}
3860                     if self._applied_styles:
3861                         applied_style.update(self._applied_styles[-1])
3862                     applied_style.update(style)
3863                     self._applied_styles.append(applied_style)
3864                 self._unclosed_elements.append(unclosed_elements)
3865
3866         def end(self, tag):
3867             if tag not in (_x('ttml:br'), 'br'):
3868                 unclosed_elements = self._unclosed_elements.pop()
3869                 for element in reversed(unclosed_elements):
3870                     self._out += '</%s>' % element
3871                 if unclosed_elements and self._applied_styles:
3872                     self._applied_styles.pop()
3873
3874         def data(self, data):
3875             self._out += data
3876
3877         def close(self):
3878             return self._out.strip()
3879
3880     def parse_node(node):
3881         target = TTMLPElementParser()
3882         parser = xml.etree.ElementTree.XMLParser(target=target)
3883         parser.feed(xml.etree.ElementTree.tostring(node))
3884         return parser.close()
3885
3886     for k, v in LEGACY_NAMESPACES:
3887         for ns in v:
3888             dfxp_data = dfxp_data.replace(ns, k)
3889
3890     dfxp = compat_etree_fromstring(dfxp_data)
3891     out = []
3892     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3893
3894     if not paras:
3895         raise ValueError('Invalid dfxp/TTML subtitle')
3896
3897     repeat = False
3898     while True:
3899         for style in dfxp.findall(_x('.//ttml:style')):
3900             style_id = style.get('id') or style.get(_x('xml:id'))
3901             if not style_id:
3902                 continue
3903             parent_style_id = style.get('style')
3904             if parent_style_id:
3905                 if parent_style_id not in styles:
3906                     repeat = True
3907                     continue
3908                 styles[style_id] = styles[parent_style_id].copy()
3909             for prop in SUPPORTED_STYLING:
3910                 prop_val = style.get(_x('tts:' + prop))
3911                 if prop_val:
3912                     styles.setdefault(style_id, {})[prop] = prop_val
3913         if repeat:
3914             repeat = False
3915         else:
3916             break
3917
3918     for p in ('body', 'div'):
3919         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3920         if ele is None:
3921             continue
3922         style = styles.get(ele.get('style'))
3923         if not style:
3924             continue
3925         default_style.update(style)
3926
3927     for para, index in zip(paras, itertools.count(1)):
3928         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3929         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3930         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3931         if begin_time is None:
3932             continue
3933         if not end_time:
3934             if not dur:
3935                 continue
3936             end_time = begin_time + dur
3937         out.append('%d\n%s --> %s\n%s\n\n' % (
3938             index,
3939             srt_subtitles_timecode(begin_time),
3940             srt_subtitles_timecode(end_time),
3941             parse_node(para)))
3942
3943     return ''.join(out)
3944
3945
3946 def cli_option(params, command_option, param, separator=None):
3947     param = params.get(param)
3948     return ([] if param is None
3949             else [command_option, str(param)] if separator is None
3950             else [f'{command_option}{separator}{param}'])
3951
3952
3953 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3954     param = params.get(param)
3955     assert param in (True, False, None)
3956     return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3957
3958
3959 def cli_valueless_option(params, command_option, param, expected_value=True):
3960     return [command_option] if params.get(param) == expected_value else []
3961
3962
3963 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3964     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3965         if use_compat:
3966             return argdict
3967         else:
3968             argdict = None
3969     if argdict is None:
3970         return default
3971     assert isinstance(argdict, dict)
3972
3973     assert isinstance(keys, (list, tuple))
3974     for key_list in keys:
3975         arg_list = list(filter(
3976             lambda x: x is not None,
3977             [argdict.get(key.lower()) for key in variadic(key_list)]))
3978         if arg_list:
3979             return [arg for args in arg_list for arg in args]
3980     return default
3981
3982
3983 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3984     main_key, exe = main_key.lower(), exe.lower()
3985     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3986     keys = [f'{root_key}{k}' for k in (keys or [''])]
3987     if root_key in keys:
3988         if main_key != exe:
3989             keys.append((main_key, exe))
3990         keys.append('default')
3991     else:
3992         use_compat = False
3993     return cli_configuration_args(argdict, keys, default, use_compat)
3994
3995
3996 class ISO639Utils:
3997     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3998     _lang_map = {
3999         'aa': 'aar',
4000         'ab': 'abk',
4001         'ae': 'ave',
4002         'af': 'afr',
4003         'ak': 'aka',
4004         'am': 'amh',
4005         'an': 'arg',
4006         'ar': 'ara',
4007         'as': 'asm',
4008         'av': 'ava',
4009         'ay': 'aym',
4010         'az': 'aze',
4011         'ba': 'bak',
4012         'be': 'bel',
4013         'bg': 'bul',
4014         'bh': 'bih',
4015         'bi': 'bis',
4016         'bm': 'bam',
4017         'bn': 'ben',
4018         'bo': 'bod',
4019         'br': 'bre',
4020         'bs': 'bos',
4021         'ca': 'cat',
4022         'ce': 'che',
4023         'ch': 'cha',
4024         'co': 'cos',
4025         'cr': 'cre',
4026         'cs': 'ces',
4027         'cu': 'chu',
4028         'cv': 'chv',
4029         'cy': 'cym',
4030         'da': 'dan',
4031         'de': 'deu',
4032         'dv': 'div',
4033         'dz': 'dzo',
4034         'ee': 'ewe',
4035         'el': 'ell',
4036         'en': 'eng',
4037         'eo': 'epo',
4038         'es': 'spa',
4039         'et': 'est',
4040         'eu': 'eus',
4041         'fa': 'fas',
4042         'ff': 'ful',
4043         'fi': 'fin',
4044         'fj': 'fij',
4045         'fo': 'fao',
4046         'fr': 'fra',
4047         'fy': 'fry',
4048         'ga': 'gle',
4049         'gd': 'gla',
4050         'gl': 'glg',
4051         'gn': 'grn',
4052         'gu': 'guj',
4053         'gv': 'glv',
4054         'ha': 'hau',
4055         'he': 'heb',
4056         'iw': 'heb',  # Replaced by he in 1989 revision
4057         'hi': 'hin',
4058         'ho': 'hmo',
4059         'hr': 'hrv',
4060         'ht': 'hat',
4061         'hu': 'hun',
4062         'hy': 'hye',
4063         'hz': 'her',
4064         'ia': 'ina',
4065         'id': 'ind',
4066         'in': 'ind',  # Replaced by id in 1989 revision
4067         'ie': 'ile',
4068         'ig': 'ibo',
4069         'ii': 'iii',
4070         'ik': 'ipk',
4071         'io': 'ido',
4072         'is': 'isl',
4073         'it': 'ita',
4074         'iu': 'iku',
4075         'ja': 'jpn',
4076         'jv': 'jav',
4077         'ka': 'kat',
4078         'kg': 'kon',
4079         'ki': 'kik',
4080         'kj': 'kua',
4081         'kk': 'kaz',
4082         'kl': 'kal',
4083         'km': 'khm',
4084         'kn': 'kan',
4085         'ko': 'kor',
4086         'kr': 'kau',
4087         'ks': 'kas',
4088         'ku': 'kur',
4089         'kv': 'kom',
4090         'kw': 'cor',
4091         'ky': 'kir',
4092         'la': 'lat',
4093         'lb': 'ltz',
4094         'lg': 'lug',
4095         'li': 'lim',
4096         'ln': 'lin',
4097         'lo': 'lao',
4098         'lt': 'lit',
4099         'lu': 'lub',
4100         'lv': 'lav',
4101         'mg': 'mlg',
4102         'mh': 'mah',
4103         'mi': 'mri',
4104         'mk': 'mkd',
4105         'ml': 'mal',
4106         'mn': 'mon',
4107         'mr': 'mar',
4108         'ms': 'msa',
4109         'mt': 'mlt',
4110         'my': 'mya',
4111         'na': 'nau',
4112         'nb': 'nob',
4113         'nd': 'nde',
4114         'ne': 'nep',
4115         'ng': 'ndo',
4116         'nl': 'nld',
4117         'nn': 'nno',
4118         'no': 'nor',
4119         'nr': 'nbl',
4120         'nv': 'nav',
4121         'ny': 'nya',
4122         'oc': 'oci',
4123         'oj': 'oji',
4124         'om': 'orm',
4125         'or': 'ori',
4126         'os': 'oss',
4127         'pa': 'pan',
4128         'pi': 'pli',
4129         'pl': 'pol',
4130         'ps': 'pus',
4131         'pt': 'por',
4132         'qu': 'que',
4133         'rm': 'roh',
4134         'rn': 'run',
4135         'ro': 'ron',
4136         'ru': 'rus',
4137         'rw': 'kin',
4138         'sa': 'san',
4139         'sc': 'srd',
4140         'sd': 'snd',
4141         'se': 'sme',
4142         'sg': 'sag',
4143         'si': 'sin',
4144         'sk': 'slk',
4145         'sl': 'slv',
4146         'sm': 'smo',
4147         'sn': 'sna',
4148         'so': 'som',
4149         'sq': 'sqi',
4150         'sr': 'srp',
4151         'ss': 'ssw',
4152         'st': 'sot',
4153         'su': 'sun',
4154         'sv': 'swe',
4155         'sw': 'swa',
4156         'ta': 'tam',
4157         'te': 'tel',
4158         'tg': 'tgk',
4159         'th': 'tha',
4160         'ti': 'tir',
4161         'tk': 'tuk',
4162         'tl': 'tgl',
4163         'tn': 'tsn',
4164         'to': 'ton',
4165         'tr': 'tur',
4166         'ts': 'tso',
4167         'tt': 'tat',
4168         'tw': 'twi',
4169         'ty': 'tah',
4170         'ug': 'uig',
4171         'uk': 'ukr',
4172         'ur': 'urd',
4173         'uz': 'uzb',
4174         've': 'ven',
4175         'vi': 'vie',
4176         'vo': 'vol',
4177         'wa': 'wln',
4178         'wo': 'wol',
4179         'xh': 'xho',
4180         'yi': 'yid',
4181         'ji': 'yid',  # Replaced by yi in 1989 revision
4182         'yo': 'yor',
4183         'za': 'zha',
4184         'zh': 'zho',
4185         'zu': 'zul',
4186     }
4187
4188     @classmethod
4189     def short2long(cls, code):
4190         """Convert language code from ISO 639-1 to ISO 639-2/T"""
4191         return cls._lang_map.get(code[:2])
4192
4193     @classmethod
4194     def long2short(cls, code):
4195         """Convert language code from ISO 639-2/T to ISO 639-1"""
4196         for short_name, long_name in cls._lang_map.items():
4197             if long_name == code:
4198                 return short_name
4199
4200
4201 class ISO3166Utils:
4202     # From http://data.okfn.org/data/core/country-list
4203     _country_map = {
4204         'AF': 'Afghanistan',
4205         'AX': 'Åland Islands',
4206         'AL': 'Albania',
4207         'DZ': 'Algeria',
4208         'AS': 'American Samoa',
4209         'AD': 'Andorra',
4210         'AO': 'Angola',
4211         'AI': 'Anguilla',
4212         'AQ': 'Antarctica',
4213         'AG': 'Antigua and Barbuda',
4214         'AR': 'Argentina',
4215         'AM': 'Armenia',
4216         'AW': 'Aruba',
4217         'AU': 'Australia',
4218         'AT': 'Austria',
4219         'AZ': 'Azerbaijan',
4220         'BS': 'Bahamas',
4221         'BH': 'Bahrain',
4222         'BD': 'Bangladesh',
4223         'BB': 'Barbados',
4224         'BY': 'Belarus',
4225         'BE': 'Belgium',
4226         'BZ': 'Belize',
4227         'BJ': 'Benin',
4228         'BM': 'Bermuda',
4229         'BT': 'Bhutan',
4230         'BO': 'Bolivia, Plurinational State of',
4231         'BQ': 'Bonaire, Sint Eustatius and Saba',
4232         'BA': 'Bosnia and Herzegovina',
4233         'BW': 'Botswana',
4234         'BV': 'Bouvet Island',
4235         'BR': 'Brazil',
4236         'IO': 'British Indian Ocean Territory',
4237         'BN': 'Brunei Darussalam',
4238         'BG': 'Bulgaria',
4239         'BF': 'Burkina Faso',
4240         'BI': 'Burundi',
4241         'KH': 'Cambodia',
4242         'CM': 'Cameroon',
4243         'CA': 'Canada',
4244         'CV': 'Cape Verde',
4245         'KY': 'Cayman Islands',
4246         'CF': 'Central African Republic',
4247         'TD': 'Chad',
4248         'CL': 'Chile',
4249         'CN': 'China',
4250         'CX': 'Christmas Island',
4251         'CC': 'Cocos (Keeling) Islands',
4252         'CO': 'Colombia',
4253         'KM': 'Comoros',
4254         'CG': 'Congo',
4255         'CD': 'Congo, the Democratic Republic of the',
4256         'CK': 'Cook Islands',
4257         'CR': 'Costa Rica',
4258         'CI': 'Côte d\'Ivoire',
4259         'HR': 'Croatia',
4260         'CU': 'Cuba',
4261         'CW': 'Curaçao',
4262         'CY': 'Cyprus',
4263         'CZ': 'Czech Republic',
4264         'DK': 'Denmark',
4265         'DJ': 'Djibouti',
4266         'DM': 'Dominica',
4267         'DO': 'Dominican Republic',
4268         'EC': 'Ecuador',
4269         'EG': 'Egypt',
4270         'SV': 'El Salvador',
4271         'GQ': 'Equatorial Guinea',
4272         'ER': 'Eritrea',
4273         'EE': 'Estonia',
4274         'ET': 'Ethiopia',
4275         'FK': 'Falkland Islands (Malvinas)',
4276         'FO': 'Faroe Islands',
4277         'FJ': 'Fiji',
4278         'FI': 'Finland',
4279         'FR': 'France',
4280         'GF': 'French Guiana',
4281         'PF': 'French Polynesia',
4282         'TF': 'French Southern Territories',
4283         'GA': 'Gabon',
4284         'GM': 'Gambia',
4285         'GE': 'Georgia',
4286         'DE': 'Germany',
4287         'GH': 'Ghana',
4288         'GI': 'Gibraltar',
4289         'GR': 'Greece',
4290         'GL': 'Greenland',
4291         'GD': 'Grenada',
4292         'GP': 'Guadeloupe',
4293         'GU': 'Guam',
4294         'GT': 'Guatemala',
4295         'GG': 'Guernsey',
4296         'GN': 'Guinea',
4297         'GW': 'Guinea-Bissau',
4298         'GY': 'Guyana',
4299         'HT': 'Haiti',
4300         'HM': 'Heard Island and McDonald Islands',
4301         'VA': 'Holy See (Vatican City State)',
4302         'HN': 'Honduras',
4303         'HK': 'Hong Kong',
4304         'HU': 'Hungary',
4305         'IS': 'Iceland',
4306         'IN': 'India',
4307         'ID': 'Indonesia',
4308         'IR': 'Iran, Islamic Republic of',
4309         'IQ': 'Iraq',
4310         'IE': 'Ireland',
4311         'IM': 'Isle of Man',
4312         'IL': 'Israel',
4313         'IT': 'Italy',
4314         'JM': 'Jamaica',
4315         'JP': 'Japan',
4316         'JE': 'Jersey',
4317         'JO': 'Jordan',
4318         'KZ': 'Kazakhstan',
4319         'KE': 'Kenya',
4320         'KI': 'Kiribati',
4321         'KP': 'Korea, Democratic People\'s Republic of',
4322         'KR': 'Korea, Republic of',
4323         'KW': 'Kuwait',
4324         'KG': 'Kyrgyzstan',
4325         'LA': 'Lao People\'s Democratic Republic',
4326         'LV': 'Latvia',
4327         'LB': 'Lebanon',
4328         'LS': 'Lesotho',
4329         'LR': 'Liberia',
4330         'LY': 'Libya',
4331         'LI': 'Liechtenstein',
4332         'LT': 'Lithuania',
4333         'LU': 'Luxembourg',
4334         'MO': 'Macao',
4335         'MK': 'Macedonia, the Former Yugoslav Republic of',
4336         'MG': 'Madagascar',
4337         'MW': 'Malawi',
4338         'MY': 'Malaysia',
4339         'MV': 'Maldives',
4340         'ML': 'Mali',
4341         'MT': 'Malta',
4342         'MH': 'Marshall Islands',
4343         'MQ': 'Martinique',
4344         'MR': 'Mauritania',
4345         'MU': 'Mauritius',
4346         'YT': 'Mayotte',
4347         'MX': 'Mexico',
4348         'FM': 'Micronesia, Federated States of',
4349         'MD': 'Moldova, Republic of',
4350         'MC': 'Monaco',
4351         'MN': 'Mongolia',
4352         'ME': 'Montenegro',
4353         'MS': 'Montserrat',
4354         'MA': 'Morocco',
4355         'MZ': 'Mozambique',
4356         'MM': 'Myanmar',
4357         'NA': 'Namibia',
4358         'NR': 'Nauru',
4359         'NP': 'Nepal',
4360         'NL': 'Netherlands',
4361         'NC': 'New Caledonia',
4362         'NZ': 'New Zealand',
4363         'NI': 'Nicaragua',
4364         'NE': 'Niger',
4365         'NG': 'Nigeria',
4366         'NU': 'Niue',
4367         'NF': 'Norfolk Island',
4368         'MP': 'Northern Mariana Islands',
4369         'NO': 'Norway',
4370         'OM': 'Oman',
4371         'PK': 'Pakistan',
4372         'PW': 'Palau',
4373         'PS': 'Palestine, State of',
4374         'PA': 'Panama',
4375         'PG': 'Papua New Guinea',
4376         'PY': 'Paraguay',
4377         'PE': 'Peru',
4378         'PH': 'Philippines',
4379         'PN': 'Pitcairn',
4380         'PL': 'Poland',
4381         'PT': 'Portugal',
4382         'PR': 'Puerto Rico',
4383         'QA': 'Qatar',
4384         'RE': 'Réunion',
4385         'RO': 'Romania',
4386         'RU': 'Russian Federation',
4387         'RW': 'Rwanda',
4388         'BL': 'Saint Barthélemy',
4389         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4390         'KN': 'Saint Kitts and Nevis',
4391         'LC': 'Saint Lucia',
4392         'MF': 'Saint Martin (French part)',
4393         'PM': 'Saint Pierre and Miquelon',
4394         'VC': 'Saint Vincent and the Grenadines',
4395         'WS': 'Samoa',
4396         'SM': 'San Marino',
4397         'ST': 'Sao Tome and Principe',
4398         'SA': 'Saudi Arabia',
4399         'SN': 'Senegal',
4400         'RS': 'Serbia',
4401         'SC': 'Seychelles',
4402         'SL': 'Sierra Leone',
4403         'SG': 'Singapore',
4404         'SX': 'Sint Maarten (Dutch part)',
4405         'SK': 'Slovakia',
4406         'SI': 'Slovenia',
4407         'SB': 'Solomon Islands',
4408         'SO': 'Somalia',
4409         'ZA': 'South Africa',
4410         'GS': 'South Georgia and the South Sandwich Islands',
4411         'SS': 'South Sudan',
4412         'ES': 'Spain',
4413         'LK': 'Sri Lanka',
4414         'SD': 'Sudan',
4415         'SR': 'Suriname',
4416         'SJ': 'Svalbard and Jan Mayen',
4417         'SZ': 'Swaziland',
4418         'SE': 'Sweden',
4419         'CH': 'Switzerland',
4420         'SY': 'Syrian Arab Republic',
4421         'TW': 'Taiwan, Province of China',
4422         'TJ': 'Tajikistan',
4423         'TZ': 'Tanzania, United Republic of',
4424         'TH': 'Thailand',
4425         'TL': 'Timor-Leste',
4426         'TG': 'Togo',
4427         'TK': 'Tokelau',
4428         'TO': 'Tonga',
4429         'TT': 'Trinidad and Tobago',
4430         'TN': 'Tunisia',
4431         'TR': 'Turkey',
4432         'TM': 'Turkmenistan',
4433         'TC': 'Turks and Caicos Islands',
4434         'TV': 'Tuvalu',
4435         'UG': 'Uganda',
4436         'UA': 'Ukraine',
4437         'AE': 'United Arab Emirates',
4438         'GB': 'United Kingdom',
4439         'US': 'United States',
4440         'UM': 'United States Minor Outlying Islands',
4441         'UY': 'Uruguay',
4442         'UZ': 'Uzbekistan',
4443         'VU': 'Vanuatu',
4444         'VE': 'Venezuela, Bolivarian Republic of',
4445         'VN': 'Viet Nam',
4446         'VG': 'Virgin Islands, British',
4447         'VI': 'Virgin Islands, U.S.',
4448         'WF': 'Wallis and Futuna',
4449         'EH': 'Western Sahara',
4450         'YE': 'Yemen',
4451         'ZM': 'Zambia',
4452         'ZW': 'Zimbabwe',
4453         # Not ISO 3166 codes, but used for IP blocks
4454         'AP': 'Asia/Pacific Region',
4455         'EU': 'Europe',
4456     }
4457
4458     @classmethod
4459     def short2full(cls, code):
4460         """Convert an ISO 3166-2 country code to the corresponding full name"""
4461         return cls._country_map.get(code.upper())
4462
4463
4464 class GeoUtils:
4465     # Major IPv4 address blocks per country
4466     _country_ip_map = {
4467         'AD': '46.172.224.0/19',
4468         'AE': '94.200.0.0/13',
4469         'AF': '149.54.0.0/17',
4470         'AG': '209.59.64.0/18',
4471         'AI': '204.14.248.0/21',
4472         'AL': '46.99.0.0/16',
4473         'AM': '46.70.0.0/15',
4474         'AO': '105.168.0.0/13',
4475         'AP': '182.50.184.0/21',
4476         'AQ': '23.154.160.0/24',
4477         'AR': '181.0.0.0/12',
4478         'AS': '202.70.112.0/20',
4479         'AT': '77.116.0.0/14',
4480         'AU': '1.128.0.0/11',
4481         'AW': '181.41.0.0/18',
4482         'AX': '185.217.4.0/22',
4483         'AZ': '5.197.0.0/16',
4484         'BA': '31.176.128.0/17',
4485         'BB': '65.48.128.0/17',
4486         'BD': '114.130.0.0/16',
4487         'BE': '57.0.0.0/8',
4488         'BF': '102.178.0.0/15',
4489         'BG': '95.42.0.0/15',
4490         'BH': '37.131.0.0/17',
4491         'BI': '154.117.192.0/18',
4492         'BJ': '137.255.0.0/16',
4493         'BL': '185.212.72.0/23',
4494         'BM': '196.12.64.0/18',
4495         'BN': '156.31.0.0/16',
4496         'BO': '161.56.0.0/16',
4497         'BQ': '161.0.80.0/20',
4498         'BR': '191.128.0.0/12',
4499         'BS': '24.51.64.0/18',
4500         'BT': '119.2.96.0/19',
4501         'BW': '168.167.0.0/16',
4502         'BY': '178.120.0.0/13',
4503         'BZ': '179.42.192.0/18',
4504         'CA': '99.224.0.0/11',
4505         'CD': '41.243.0.0/16',
4506         'CF': '197.242.176.0/21',
4507         'CG': '160.113.0.0/16',
4508         'CH': '85.0.0.0/13',
4509         'CI': '102.136.0.0/14',
4510         'CK': '202.65.32.0/19',
4511         'CL': '152.172.0.0/14',
4512         'CM': '102.244.0.0/14',
4513         'CN': '36.128.0.0/10',
4514         'CO': '181.240.0.0/12',
4515         'CR': '201.192.0.0/12',
4516         'CU': '152.206.0.0/15',
4517         'CV': '165.90.96.0/19',
4518         'CW': '190.88.128.0/17',
4519         'CY': '31.153.0.0/16',
4520         'CZ': '88.100.0.0/14',
4521         'DE': '53.0.0.0/8',
4522         'DJ': '197.241.0.0/17',
4523         'DK': '87.48.0.0/12',
4524         'DM': '192.243.48.0/20',
4525         'DO': '152.166.0.0/15',
4526         'DZ': '41.96.0.0/12',
4527         'EC': '186.68.0.0/15',
4528         'EE': '90.190.0.0/15',
4529         'EG': '156.160.0.0/11',
4530         'ER': '196.200.96.0/20',
4531         'ES': '88.0.0.0/11',
4532         'ET': '196.188.0.0/14',
4533         'EU': '2.16.0.0/13',
4534         'FI': '91.152.0.0/13',
4535         'FJ': '144.120.0.0/16',
4536         'FK': '80.73.208.0/21',
4537         'FM': '119.252.112.0/20',
4538         'FO': '88.85.32.0/19',
4539         'FR': '90.0.0.0/9',
4540         'GA': '41.158.0.0/15',
4541         'GB': '25.0.0.0/8',
4542         'GD': '74.122.88.0/21',
4543         'GE': '31.146.0.0/16',
4544         'GF': '161.22.64.0/18',
4545         'GG': '62.68.160.0/19',
4546         'GH': '154.160.0.0/12',
4547         'GI': '95.164.0.0/16',
4548         'GL': '88.83.0.0/19',
4549         'GM': '160.182.0.0/15',
4550         'GN': '197.149.192.0/18',
4551         'GP': '104.250.0.0/19',
4552         'GQ': '105.235.224.0/20',
4553         'GR': '94.64.0.0/13',
4554         'GT': '168.234.0.0/16',
4555         'GU': '168.123.0.0/16',
4556         'GW': '197.214.80.0/20',
4557         'GY': '181.41.64.0/18',
4558         'HK': '113.252.0.0/14',
4559         'HN': '181.210.0.0/16',
4560         'HR': '93.136.0.0/13',
4561         'HT': '148.102.128.0/17',
4562         'HU': '84.0.0.0/14',
4563         'ID': '39.192.0.0/10',
4564         'IE': '87.32.0.0/12',
4565         'IL': '79.176.0.0/13',
4566         'IM': '5.62.80.0/20',
4567         'IN': '117.192.0.0/10',
4568         'IO': '203.83.48.0/21',
4569         'IQ': '37.236.0.0/14',
4570         'IR': '2.176.0.0/12',
4571         'IS': '82.221.0.0/16',
4572         'IT': '79.0.0.0/10',
4573         'JE': '87.244.64.0/18',
4574         'JM': '72.27.0.0/17',
4575         'JO': '176.29.0.0/16',
4576         'JP': '133.0.0.0/8',
4577         'KE': '105.48.0.0/12',
4578         'KG': '158.181.128.0/17',
4579         'KH': '36.37.128.0/17',
4580         'KI': '103.25.140.0/22',
4581         'KM': '197.255.224.0/20',
4582         'KN': '198.167.192.0/19',
4583         'KP': '175.45.176.0/22',
4584         'KR': '175.192.0.0/10',
4585         'KW': '37.36.0.0/14',
4586         'KY': '64.96.0.0/15',
4587         'KZ': '2.72.0.0/13',
4588         'LA': '115.84.64.0/18',
4589         'LB': '178.135.0.0/16',
4590         'LC': '24.92.144.0/20',
4591         'LI': '82.117.0.0/19',
4592         'LK': '112.134.0.0/15',
4593         'LR': '102.183.0.0/16',
4594         'LS': '129.232.0.0/17',
4595         'LT': '78.56.0.0/13',
4596         'LU': '188.42.0.0/16',
4597         'LV': '46.109.0.0/16',
4598         'LY': '41.252.0.0/14',
4599         'MA': '105.128.0.0/11',
4600         'MC': '88.209.64.0/18',
4601         'MD': '37.246.0.0/16',
4602         'ME': '178.175.0.0/17',
4603         'MF': '74.112.232.0/21',
4604         'MG': '154.126.0.0/17',
4605         'MH': '117.103.88.0/21',
4606         'MK': '77.28.0.0/15',
4607         'ML': '154.118.128.0/18',
4608         'MM': '37.111.0.0/17',
4609         'MN': '49.0.128.0/17',
4610         'MO': '60.246.0.0/16',
4611         'MP': '202.88.64.0/20',
4612         'MQ': '109.203.224.0/19',
4613         'MR': '41.188.64.0/18',
4614         'MS': '208.90.112.0/22',
4615         'MT': '46.11.0.0/16',
4616         'MU': '105.16.0.0/12',
4617         'MV': '27.114.128.0/18',
4618         'MW': '102.70.0.0/15',
4619         'MX': '187.192.0.0/11',
4620         'MY': '175.136.0.0/13',
4621         'MZ': '197.218.0.0/15',
4622         'NA': '41.182.0.0/16',
4623         'NC': '101.101.0.0/18',
4624         'NE': '197.214.0.0/18',
4625         'NF': '203.17.240.0/22',
4626         'NG': '105.112.0.0/12',
4627         'NI': '186.76.0.0/15',
4628         'NL': '145.96.0.0/11',
4629         'NO': '84.208.0.0/13',
4630         'NP': '36.252.0.0/15',
4631         'NR': '203.98.224.0/19',
4632         'NU': '49.156.48.0/22',
4633         'NZ': '49.224.0.0/14',
4634         'OM': '5.36.0.0/15',
4635         'PA': '186.72.0.0/15',
4636         'PE': '186.160.0.0/14',
4637         'PF': '123.50.64.0/18',
4638         'PG': '124.240.192.0/19',
4639         'PH': '49.144.0.0/13',
4640         'PK': '39.32.0.0/11',
4641         'PL': '83.0.0.0/11',
4642         'PM': '70.36.0.0/20',
4643         'PR': '66.50.0.0/16',
4644         'PS': '188.161.0.0/16',
4645         'PT': '85.240.0.0/13',
4646         'PW': '202.124.224.0/20',
4647         'PY': '181.120.0.0/14',
4648         'QA': '37.210.0.0/15',
4649         'RE': '102.35.0.0/16',
4650         'RO': '79.112.0.0/13',
4651         'RS': '93.86.0.0/15',
4652         'RU': '5.136.0.0/13',
4653         'RW': '41.186.0.0/16',
4654         'SA': '188.48.0.0/13',
4655         'SB': '202.1.160.0/19',
4656         'SC': '154.192.0.0/11',
4657         'SD': '102.120.0.0/13',
4658         'SE': '78.64.0.0/12',
4659         'SG': '8.128.0.0/10',
4660         'SI': '188.196.0.0/14',
4661         'SK': '78.98.0.0/15',
4662         'SL': '102.143.0.0/17',
4663         'SM': '89.186.32.0/19',
4664         'SN': '41.82.0.0/15',
4665         'SO': '154.115.192.0/18',
4666         'SR': '186.179.128.0/17',
4667         'SS': '105.235.208.0/21',
4668         'ST': '197.159.160.0/19',
4669         'SV': '168.243.0.0/16',
4670         'SX': '190.102.0.0/20',
4671         'SY': '5.0.0.0/16',
4672         'SZ': '41.84.224.0/19',
4673         'TC': '65.255.48.0/20',
4674         'TD': '154.68.128.0/19',
4675         'TG': '196.168.0.0/14',
4676         'TH': '171.96.0.0/13',
4677         'TJ': '85.9.128.0/18',
4678         'TK': '27.96.24.0/21',
4679         'TL': '180.189.160.0/20',
4680         'TM': '95.85.96.0/19',
4681         'TN': '197.0.0.0/11',
4682         'TO': '175.176.144.0/21',
4683         'TR': '78.160.0.0/11',
4684         'TT': '186.44.0.0/15',
4685         'TV': '202.2.96.0/19',
4686         'TW': '120.96.0.0/11',
4687         'TZ': '156.156.0.0/14',
4688         'UA': '37.52.0.0/14',
4689         'UG': '102.80.0.0/13',
4690         'US': '6.0.0.0/8',
4691         'UY': '167.56.0.0/13',
4692         'UZ': '84.54.64.0/18',
4693         'VA': '212.77.0.0/19',
4694         'VC': '207.191.240.0/21',
4695         'VE': '186.88.0.0/13',
4696         'VG': '66.81.192.0/20',
4697         'VI': '146.226.0.0/16',
4698         'VN': '14.160.0.0/11',
4699         'VU': '202.80.32.0/20',
4700         'WF': '117.20.32.0/21',
4701         'WS': '202.4.32.0/19',
4702         'YE': '134.35.0.0/16',
4703         'YT': '41.242.116.0/22',
4704         'ZA': '41.0.0.0/11',
4705         'ZM': '102.144.0.0/13',
4706         'ZW': '102.177.192.0/18',
4707     }
4708
4709     @classmethod
4710     def random_ipv4(cls, code_or_block):
4711         if len(code_or_block) == 2:
4712             block = cls._country_ip_map.get(code_or_block.upper())
4713             if not block:
4714                 return None
4715         else:
4716             block = code_or_block
4717         addr, preflen = block.split('/')
4718         addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4719         addr_max = addr_min | (0xffffffff >> int(preflen))
4720         return str(socket.inet_ntoa(
4721             struct.pack('!L', random.randint(addr_min, addr_max))))
4722
4723
4724 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4725     def __init__(self, proxies=None):
4726         # Set default handlers
4727         for type in ('http', 'https'):
4728             setattr(self, '%s_open' % type,
4729                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4730                         meth(r, proxy, type))
4731         urllib.request.ProxyHandler.__init__(self, proxies)
4732
4733     def proxy_open(self, req, proxy, type):
4734         req_proxy = req.headers.get('Ytdl-request-proxy')
4735         if req_proxy is not None:
4736             proxy = req_proxy
4737             del req.headers['Ytdl-request-proxy']
4738
4739         if proxy == '__noproxy__':
4740             return None  # No Proxy
4741         if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4742             req.add_header('Ytdl-socks-proxy', proxy)
4743             # yt-dlp's http/https handlers do wrapping the socket with socks
4744             return None
4745         return urllib.request.ProxyHandler.proxy_open(
4746             self, req, proxy, type)
4747
4748
4749 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4750 # released into Public Domain
4751 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4752
4753 def long_to_bytes(n, blocksize=0):
4754     """long_to_bytes(n:long, blocksize:int) : string
4755     Convert a long integer to a byte string.
4756
4757     If optional blocksize is given and greater than zero, pad the front of the
4758     byte string with binary zeros so that the length is a multiple of
4759     blocksize.
4760     """
4761     # after much testing, this algorithm was deemed to be the fastest
4762     s = b''
4763     n = int(n)
4764     while n > 0:
4765         s = struct.pack('>I', n & 0xffffffff) + s
4766         n = n >> 32
4767     # strip off leading zeros
4768     for i in range(len(s)):
4769         if s[i] != b'\000'[0]:
4770             break
4771     else:
4772         # only happens when n == 0
4773         s = b'\000'
4774         i = 0
4775     s = s[i:]
4776     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4777     # de-padding being done above, but sigh...
4778     if blocksize > 0 and len(s) % blocksize:
4779         s = (blocksize - len(s) % blocksize) * b'\000' + s
4780     return s
4781
4782
4783 def bytes_to_long(s):
4784     """bytes_to_long(string) : long
4785     Convert a byte string to a long integer.
4786
4787     This is (essentially) the inverse of long_to_bytes().
4788     """
4789     acc = 0
4790     length = len(s)
4791     if length % 4:
4792         extra = (4 - length % 4)
4793         s = b'\000' * extra + s
4794         length = length + extra
4795     for i in range(0, length, 4):
4796         acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4797     return acc
4798
4799
4800 def ohdave_rsa_encrypt(data, exponent, modulus):
4801     '''
4802     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4803
4804     Input:
4805         data: data to encrypt, bytes-like object
4806         exponent, modulus: parameter e and N of RSA algorithm, both integer
4807     Output: hex string of encrypted data
4808
4809     Limitation: supports one block encryption only
4810     '''
4811
4812     payload = int(binascii.hexlify(data[::-1]), 16)
4813     encrypted = pow(payload, exponent, modulus)
4814     return '%x' % encrypted
4815
4816
4817 def pkcs1pad(data, length):
4818     """
4819     Padding input data with PKCS#1 scheme
4820
4821     @param {int[]} data        input data
4822     @param {int}   length      target length
4823     @returns {int[]}           padded data
4824     """
4825     if len(data) > length - 11:
4826         raise ValueError('Input data too long for PKCS#1 padding')
4827
4828     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4829     return [0, 2] + pseudo_random + [0] + data
4830
4831
4832 def _base_n_table(n, table):
4833     if not table and not n:
4834         raise ValueError('Either table or n must be specified')
4835     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4836
4837     if n and n != len(table):
4838         raise ValueError(f'base {n} exceeds table length {len(table)}')
4839     return table
4840
4841
4842 def encode_base_n(num, n=None, table=None):
4843     """Convert given int to a base-n string"""
4844     table = _base_n_table(n, table)
4845     if not num:
4846         return table[0]
4847
4848     result, base = '', len(table)
4849     while num:
4850         result = table[num % base] + result
4851         num = num // base
4852     return result
4853
4854
4855 def decode_base_n(string, n=None, table=None):
4856     """Convert given base-n string to int"""
4857     table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4858     result, base = 0, len(table)
4859     for char in string:
4860         result = result * base + table[char]
4861     return result
4862
4863
4864 def decode_base(value, digits):
4865     write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4866                  'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
4867     return decode_base_n(value, table=digits)
4868
4869
4870 def decode_packed_codes(code):
4871     mobj = re.search(PACKED_CODES_RE, code)
4872     obfuscated_code, base, count, symbols = mobj.groups()
4873     base = int(base)
4874     count = int(count)
4875     symbols = symbols.split('|')
4876     symbol_table = {}
4877
4878     while count:
4879         count -= 1
4880         base_n_count = encode_base_n(count, base)
4881         symbol_table[base_n_count] = symbols[count] or base_n_count
4882
4883     return re.sub(
4884         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4885         obfuscated_code)
4886
4887
4888 def caesar(s, alphabet, shift):
4889     if shift == 0:
4890         return s
4891     l = len(alphabet)
4892     return ''.join(
4893         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4894         for c in s)
4895
4896
4897 def rot47(s):
4898     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4899
4900
4901 def parse_m3u8_attributes(attrib):
4902     info = {}
4903     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4904         if val.startswith('"'):
4905             val = val[1:-1]
4906         info[key] = val
4907     return info
4908
4909
4910 def urshift(val, n):
4911     return val >> n if val >= 0 else (val + 0x100000000) >> n
4912
4913
4914 # Based on png2str() written by @gdkchan and improved by @yokrysty
4915 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4916 def decode_png(png_data):
4917     # Reference: https://www.w3.org/TR/PNG/
4918     header = png_data[8:]
4919
4920     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4921         raise OSError('Not a valid PNG file.')
4922
4923     int_map = {1: '>B', 2: '>H', 4: '>I'}
4924     unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4925
4926     chunks = []
4927
4928     while header:
4929         length = unpack_integer(header[:4])
4930         header = header[4:]
4931
4932         chunk_type = header[:4]
4933         header = header[4:]
4934
4935         chunk_data = header[:length]
4936         header = header[length:]
4937
4938         header = header[4:]  # Skip CRC
4939
4940         chunks.append({
4941             'type': chunk_type,
4942             'length': length,
4943             'data': chunk_data
4944         })
4945
4946     ihdr = chunks[0]['data']
4947
4948     width = unpack_integer(ihdr[:4])
4949     height = unpack_integer(ihdr[4:8])
4950
4951     idat = b''
4952
4953     for chunk in chunks:
4954         if chunk['type'] == b'IDAT':
4955             idat += chunk['data']
4956
4957     if not idat:
4958         raise OSError('Unable to read PNG data.')
4959
4960     decompressed_data = bytearray(zlib.decompress(idat))
4961
4962     stride = width * 3
4963     pixels = []
4964
4965     def _get_pixel(idx):
4966         x = idx % stride
4967         y = idx // stride
4968         return pixels[y][x]
4969
4970     for y in range(height):
4971         basePos = y * (1 + stride)
4972         filter_type = decompressed_data[basePos]
4973
4974         current_row = []
4975
4976         pixels.append(current_row)
4977
4978         for x in range(stride):
4979             color = decompressed_data[1 + basePos + x]
4980             basex = y * stride + x
4981             left = 0
4982             up = 0
4983
4984             if x > 2:
4985                 left = _get_pixel(basex - 3)
4986             if y > 0:
4987                 up = _get_pixel(basex - stride)
4988
4989             if filter_type == 1:  # Sub
4990                 color = (color + left) & 0xff
4991             elif filter_type == 2:  # Up
4992                 color = (color + up) & 0xff
4993             elif filter_type == 3:  # Average
4994                 color = (color + ((left + up) >> 1)) & 0xff
4995             elif filter_type == 4:  # Paeth
4996                 a = left
4997                 b = up
4998                 c = 0
4999
5000                 if x > 2 and y > 0:
5001                     c = _get_pixel(basex - stride - 3)
5002
5003                 p = a + b - c
5004
5005                 pa = abs(p - a)
5006                 pb = abs(p - b)
5007                 pc = abs(p - c)
5008
5009                 if pa <= pb and pa <= pc:
5010                     color = (color + a) & 0xff
5011                 elif pb <= pc:
5012                     color = (color + b) & 0xff
5013                 else:
5014                     color = (color + c) & 0xff
5015
5016             current_row.append(color)
5017
5018     return width, height, pixels
5019
5020
5021 def write_xattr(path, key, value):
5022     # Windows: Write xattrs to NTFS Alternate Data Streams:
5023     # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5024     if compat_os_name == 'nt':
5025         assert ':' not in key
5026         assert os.path.exists(path)
5027
5028         try:
5029             with open(f'{path}:{key}', 'wb') as f:
5030                 f.write(value)
5031         except OSError as e:
5032             raise XAttrMetadataError(e.errno, e.strerror)
5033         return
5034
5035     # UNIX Method 1. Use xattrs/pyxattrs modules
5036
5037     setxattr = None
5038     if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5039         # Unicode arguments are not supported in pyxattr until version 0.5.0
5040         # See https://github.com/ytdl-org/youtube-dl/issues/5498
5041         if version_tuple(xattr.__version__) >= (0, 5, 0):
5042             setxattr = xattr.set
5043     elif xattr:
5044         setxattr = xattr.setxattr
5045
5046     if setxattr:
5047         try:
5048             setxattr(path, key, value)
5049         except OSError as e:
5050             raise XAttrMetadataError(e.errno, e.strerror)
5051         return
5052
5053     # UNIX Method 2. Use setfattr/xattr executables
5054     exe = ('setfattr' if check_executable('setfattr', ['--version'])
5055            else 'xattr' if check_executable('xattr', ['-h']) else None)
5056     if not exe:
5057         raise XAttrUnavailableError(
5058             'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5059             + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5060
5061     value = value.decode()
5062     try:
5063         _, stderr, returncode = Popen.run(
5064             [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5065             text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5066     except OSError as e:
5067         raise XAttrMetadataError(e.errno, e.strerror)
5068     if returncode:
5069         raise XAttrMetadataError(returncode, stderr)
5070
5071
5072 def random_birthday(year_field, month_field, day_field):
5073     start_date = datetime.date(1950, 1, 1)
5074     end_date = datetime.date(1995, 12, 31)
5075     offset = random.randint(0, (end_date - start_date).days)
5076     random_date = start_date + datetime.timedelta(offset)
5077     return {
5078         year_field: str(random_date.year),
5079         month_field: str(random_date.month),
5080         day_field: str(random_date.day),
5081     }
5082
5083
5084 # Templates for internet shortcut files, which are plain text files.
5085 DOT_URL_LINK_TEMPLATE = '''\
5086 [InternetShortcut]
5087 URL=%(url)s
5088 '''
5089
5090 DOT_WEBLOC_LINK_TEMPLATE = '''\
5091 <?xml version="1.0" encoding="UTF-8"?>
5092 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5093 <plist version="1.0">
5094 <dict>
5095 \t<key>URL</key>
5096 \t<string>%(url)s</string>
5097 </dict>
5098 </plist>
5099 '''
5100
5101 DOT_DESKTOP_LINK_TEMPLATE = '''\
5102 [Desktop Entry]
5103 Encoding=UTF-8
5104 Name=%(filename)s
5105 Type=Link
5106 URL=%(url)s
5107 Icon=text-html
5108 '''
5109
5110 LINK_TEMPLATES = {
5111     'url': DOT_URL_LINK_TEMPLATE,
5112     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5113     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5114 }
5115
5116
5117 def iri_to_uri(iri):
5118     """
5119     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5120
5121     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5122     """
5123
5124     iri_parts = urllib.parse.urlparse(iri)
5125
5126     if '[' in iri_parts.netloc:
5127         raise ValueError('IPv6 URIs are not, yet, supported.')
5128         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5129
5130     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5131
5132     net_location = ''
5133     if iri_parts.username:
5134         net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5135         if iri_parts.password is not None:
5136             net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5137         net_location += '@'
5138
5139     net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
5140     # The 'idna' encoding produces ASCII text.
5141     if iri_parts.port is not None and iri_parts.port != 80:
5142         net_location += ':' + str(iri_parts.port)
5143
5144     return urllib.parse.urlunparse(
5145         (iri_parts.scheme,
5146             net_location,
5147
5148             urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5149
5150             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5151             urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5152
5153             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5154             urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5155
5156             urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5157
5158     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5159
5160
5161 def to_high_limit_path(path):
5162     if sys.platform in ['win32', 'cygwin']:
5163         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5164         return '\\\\?\\' + os.path.abspath(path)
5165
5166     return path
5167
5168
5169 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5170     val = traverse_obj(obj, *variadic(field))
5171     if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5172         return default
5173     return template % func(val)
5174
5175
5176 def clean_podcast_url(url):
5177     return re.sub(r'''(?x)
5178         (?:
5179             (?:
5180                 chtbl\.com/track|
5181                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5182                 play\.podtrac\.com
5183             )/[^/]+|
5184             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5185             flex\.acast\.com|
5186             pd(?:
5187                 cn\.co| # https://podcorn.com/analytics-prefix/
5188                 st\.fm # https://podsights.com/docs/
5189             )/e
5190         )/''', '', url)
5191
5192
5193 _HEX_TABLE = '0123456789abcdef'
5194
5195
5196 def random_uuidv4():
5197     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5198
5199
5200 def make_dir(path, to_screen=None):
5201     try:
5202         dn = os.path.dirname(path)
5203         if dn and not os.path.exists(dn):
5204             os.makedirs(dn)
5205         return True
5206     except OSError as err:
5207         if callable(to_screen) is not None:
5208             to_screen('unable to create directory ' + error_to_compat_str(err))
5209         return False
5210
5211
5212 def get_executable_path():
5213     from .update import _get_variant_and_executable_path
5214
5215     return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5216
5217
5218 def load_plugins(name, suffix, namespace):
5219     classes = {}
5220     with contextlib.suppress(FileNotFoundError):
5221         plugins_spec = importlib.util.spec_from_file_location(
5222             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5223         plugins = importlib.util.module_from_spec(plugins_spec)
5224         sys.modules[plugins_spec.name] = plugins
5225         plugins_spec.loader.exec_module(plugins)
5226         for name in dir(plugins):
5227             if name in namespace:
5228                 continue
5229             if not name.endswith(suffix):
5230                 continue
5231             klass = getattr(plugins, name)
5232             classes[name] = namespace[name] = klass
5233     return classes
5234
5235
5236 def traverse_obj(
5237         obj, *path_list, default=None, expected_type=None, get_all=True,
5238         casesense=True, is_user_input=False, traverse_string=False):
5239     ''' Traverse nested list/dict/tuple
5240     @param path_list        A list of paths which are checked one by one.
5241                             Each path is a list of keys where each key is a:
5242                               - None:     Do nothing
5243                               - string:   A dictionary key
5244                               - int:      An index into a list
5245                               - tuple:    A list of keys all of which will be traversed
5246                               - Ellipsis: Fetch all values in the object
5247                               - Function: Takes the key and value as arguments
5248                                           and returns whether the key matches or not
5249     @param default          Default value to return
5250     @param expected_type    Only accept final value of this type (Can also be any callable)
5251     @param get_all          Return all the values obtained from a path or only the first one
5252     @param casesense        Whether to consider dictionary keys as case sensitive
5253     @param is_user_input    Whether the keys are generated from user input. If True,
5254                             strings are converted to int/slice if necessary
5255     @param traverse_string  Whether to traverse inside strings. If True, any
5256                             non-compatible object will also be converted into a string
5257     # TODO: Write tests
5258     '''
5259     if not casesense:
5260         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5261         path_list = (map(_lower, variadic(path)) for path in path_list)
5262
5263     def _traverse_obj(obj, path, _current_depth=0):
5264         nonlocal depth
5265         path = tuple(variadic(path))
5266         for i, key in enumerate(path):
5267             if None in (key, obj):
5268                 return obj
5269             if isinstance(key, (list, tuple)):
5270                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5271                 key = ...
5272             if key is ...:
5273                 obj = (obj.values() if isinstance(obj, dict)
5274                        else obj if isinstance(obj, (list, tuple, LazyList))
5275                        else str(obj) if traverse_string else [])
5276                 _current_depth += 1
5277                 depth = max(depth, _current_depth)
5278                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5279             elif callable(key):
5280                 if isinstance(obj, (list, tuple, LazyList)):
5281                     obj = enumerate(obj)
5282                 elif isinstance(obj, dict):
5283                     obj = obj.items()
5284                 else:
5285                     if not traverse_string:
5286                         return None
5287                     obj = str(obj)
5288                 _current_depth += 1
5289                 depth = max(depth, _current_depth)
5290                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5291             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5292                 obj = (obj.get(key) if casesense or (key in obj)
5293                        else next((v for k, v in obj.items() if _lower(k) == key), None))
5294             else:
5295                 if is_user_input:
5296                     key = (int_or_none(key) if ':' not in key
5297                            else slice(*map(int_or_none, key.split(':'))))
5298                     if key == slice(None):
5299                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5300                 if not isinstance(key, (int, slice)):
5301                     return None
5302                 if not isinstance(obj, (list, tuple, LazyList)):
5303                     if not traverse_string:
5304                         return None
5305                     obj = str(obj)
5306                 try:
5307                     obj = obj[key]
5308                 except IndexError:
5309                     return None
5310         return obj
5311
5312     if isinstance(expected_type, type):
5313         type_test = lambda val: val if isinstance(val, expected_type) else None
5314     else:
5315         type_test = expected_type or IDENTITY
5316
5317     for path in path_list:
5318         depth = 0
5319         val = _traverse_obj(obj, path)
5320         if val is not None:
5321             if depth:
5322                 for _ in range(depth - 1):
5323                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5324                 val = [v for v in map(type_test, val) if v is not None]
5325                 if val:
5326                     return val if get_all else val[0]
5327             else:
5328                 val = type_test(val)
5329                 if val is not None:
5330                     return val
5331     return default
5332
5333
5334 def traverse_dict(dictn, keys, casesense=True):
5335     write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5336                  'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5337     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5338
5339
5340 def get_first(obj, keys, **kwargs):
5341     return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5342
5343
5344 def variadic(x, allowed_types=(str, bytes, dict)):
5345     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5346
5347
5348 def time_seconds(**kwargs):
5349     t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5350     return t.timestamp()
5351
5352
5353 # create a JSON Web Signature (jws) with HS256 algorithm
5354 # the resulting format is in JWS Compact Serialization
5355 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5356 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5357 def jwt_encode_hs256(payload_data, key, headers={}):
5358     header_data = {
5359         'alg': 'HS256',
5360         'typ': 'JWT',
5361     }
5362     if headers:
5363         header_data.update(headers)
5364     header_b64 = base64.b64encode(json.dumps(header_data).encode())
5365     payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5366     h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5367     signature_b64 = base64.b64encode(h.digest())
5368     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5369     return token
5370
5371
5372 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5373 def jwt_decode_hs256(jwt):
5374     header_b64, payload_b64, signature_b64 = jwt.split('.')
5375     payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5376     return payload_data
5377
5378
5379 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5380
5381
5382 @functools.cache
5383 def supports_terminal_sequences(stream):
5384     if compat_os_name == 'nt':
5385         if not WINDOWS_VT_MODE:
5386             return False
5387     elif not os.getenv('TERM'):
5388         return False
5389     try:
5390         return stream.isatty()
5391     except BaseException:
5392         return False
5393
5394
5395 def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
5396     if get_windows_version() < (10, 0, 10586):
5397         return
5398     global WINDOWS_VT_MODE
5399     try:
5400         Popen.run('', shell=True)
5401     except Exception:
5402         return
5403
5404     WINDOWS_VT_MODE = True
5405     supports_terminal_sequences.cache_clear()
5406
5407
5408 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5409
5410
5411 def remove_terminal_sequences(string):
5412     return _terminal_sequences_re.sub('', string)
5413
5414
5415 def number_of_digits(number):
5416     return len('%d' % number)
5417
5418
5419 def join_nonempty(*values, delim='-', from_dict=None):
5420     if from_dict is not None:
5421         values = (traverse_obj(from_dict, variadic(v)) for v in values)
5422     return delim.join(map(str, filter(None, values)))
5423
5424
5425 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5426     """
5427     Find the largest format dimensions in terms of video width and, for each thumbnail:
5428     * Modify the URL: Match the width with the provided regex and replace with the former width
5429     * Update dimensions
5430
5431     This function is useful with video services that scale the provided thumbnails on demand
5432     """
5433     _keys = ('width', 'height')
5434     max_dimensions = max(
5435         (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5436         default=(0, 0))
5437     if not max_dimensions[0]:
5438         return thumbnails
5439     return [
5440         merge_dicts(
5441             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5442             dict(zip(_keys, max_dimensions)), thumbnail)
5443         for thumbnail in thumbnails
5444     ]
5445
5446
5447 def parse_http_range(range):
5448     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5449     if not range:
5450         return None, None, None
5451     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5452     if not crg:
5453         return None, None, None
5454     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5455
5456
5457 def read_stdin(what):
5458     eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5459     write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5460     return sys.stdin
5461
5462
5463 def determine_file_encoding(data):
5464     """
5465     Detect the text encoding used
5466     @returns (encoding, bytes to skip)
5467     """
5468
5469     # BOM marks are given priority over declarations
5470     for bom, enc in BOMS:
5471         if data.startswith(bom):
5472             return enc, len(bom)
5473
5474     # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5475     # We ignore the endianness to get a good enough match
5476     data = data.replace(b'\0', b'')
5477     mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5478     return mobj.group(1).decode() if mobj else None, 0
5479
5480
5481 class Config:
5482     own_args = None
5483     parsed_args = None
5484     filename = None
5485     __initialized = False
5486
5487     def __init__(self, parser, label=None):
5488         self.parser, self.label = parser, label
5489         self._loaded_paths, self.configs = set(), []
5490
5491     def init(self, args=None, filename=None):
5492         assert not self.__initialized
5493         self.own_args, self.filename = args, filename
5494         return self.load_configs()
5495
5496     def load_configs(self):
5497         directory = ''
5498         if self.filename:
5499             location = os.path.realpath(self.filename)
5500             directory = os.path.dirname(location)
5501             if location in self._loaded_paths:
5502                 return False
5503             self._loaded_paths.add(location)
5504
5505         self.__initialized = True
5506         opts, _ = self.parser.parse_known_args(self.own_args)
5507         self.parsed_args = self.own_args
5508         for location in opts.config_locations or []:
5509             if location == '-':
5510                 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5511                 continue
5512             location = os.path.join(directory, expand_path(location))
5513             if os.path.isdir(location):
5514                 location = os.path.join(location, 'yt-dlp.conf')
5515             if not os.path.exists(location):
5516                 self.parser.error(f'config location {location} does not exist')
5517             self.append_config(self.read_file(location), location)
5518         return True
5519
5520     def __str__(self):
5521         label = join_nonempty(
5522             self.label, 'config', f'"{self.filename}"' if self.filename else '',
5523             delim=' ')
5524         return join_nonempty(
5525             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5526             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5527             delim='\n')
5528
5529     @staticmethod
5530     def read_file(filename, default=[]):
5531         try:
5532             optionf = open(filename, 'rb')
5533         except OSError:
5534             return default  # silently skip if file is not present
5535         try:
5536             enc, skip = determine_file_encoding(optionf.read(512))
5537             optionf.seek(skip, io.SEEK_SET)
5538         except OSError:
5539             enc = None  # silently skip read errors
5540         try:
5541             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5542             contents = optionf.read().decode(enc or preferredencoding())
5543             res = shlex.split(contents, comments=True)
5544         except Exception as err:
5545             raise ValueError(f'Unable to parse "{filename}": {err}')
5546         finally:
5547             optionf.close()
5548         return res
5549
5550     @staticmethod
5551     def hide_login_info(opts):
5552         PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5553         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5554
5555         def _scrub_eq(o):
5556             m = eqre.match(o)
5557             if m:
5558                 return m.group('key') + '=PRIVATE'
5559             else:
5560                 return o
5561
5562         opts = list(map(_scrub_eq, opts))
5563         for idx, opt in enumerate(opts):
5564             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5565                 opts[idx + 1] = 'PRIVATE'
5566         return opts
5567
5568     def append_config(self, *args, label=None):
5569         config = type(self)(self.parser, label)
5570         config._loaded_paths = self._loaded_paths
5571         if config.init(*args):
5572             self.configs.append(config)
5573
5574     @property
5575     def all_args(self):
5576         for config in reversed(self.configs):
5577             yield from config.all_args
5578         yield from self.parsed_args or []
5579
5580     def parse_known_args(self, **kwargs):
5581         return self.parser.parse_known_args(self.all_args, **kwargs)
5582
5583     def parse_args(self):
5584         return self.parser.parse_args(self.all_args)
5585
5586
5587 class WebSocketsWrapper():
5588     """Wraps websockets module to use in non-async scopes"""
5589     pool = None
5590
5591     def __init__(self, url, headers=None, connect=True):
5592         self.loop = asyncio.new_event_loop()
5593         # XXX: "loop" is deprecated
5594         self.conn = websockets.connect(
5595             url, extra_headers=headers, ping_interval=None,
5596             close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5597         if connect:
5598             self.__enter__()
5599         atexit.register(self.__exit__, None, None, None)
5600
5601     def __enter__(self):
5602         if not self.pool:
5603             self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5604         return self
5605
5606     def send(self, *args):
5607         self.run_with_loop(self.pool.send(*args), self.loop)
5608
5609     def recv(self, *args):
5610         return self.run_with_loop(self.pool.recv(*args), self.loop)
5611
5612     def __exit__(self, type, value, traceback):
5613         try:
5614             return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5615         finally:
5616             self.loop.close()
5617             self._cancel_all_tasks(self.loop)
5618
5619     # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5620     # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5621     @staticmethod
5622     def run_with_loop(main, loop):
5623         if not asyncio.iscoroutine(main):
5624             raise ValueError(f'a coroutine was expected, got {main!r}')
5625
5626         try:
5627             return loop.run_until_complete(main)
5628         finally:
5629             loop.run_until_complete(loop.shutdown_asyncgens())
5630             if hasattr(loop, 'shutdown_default_executor'):
5631                 loop.run_until_complete(loop.shutdown_default_executor())
5632
5633     @staticmethod
5634     def _cancel_all_tasks(loop):
5635         to_cancel = asyncio.all_tasks(loop)
5636
5637         if not to_cancel:
5638             return
5639
5640         for task in to_cancel:
5641             task.cancel()
5642
5643         # XXX: "loop" is removed in python 3.10+
5644         loop.run_until_complete(
5645             asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5646
5647         for task in to_cancel:
5648             if task.cancelled():
5649                 continue
5650             if task.exception() is not None:
5651                 loop.call_exception_handler({
5652                     'message': 'unhandled exception during asyncio.run() shutdown',
5653                     'exception': task.exception(),
5654                     'task': task,
5655                 })
5656
5657
5658 def merge_headers(*dicts):
5659     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5660     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5661
5662
5663 def cached_method(f):
5664     """Cache a method"""
5665     signature = inspect.signature(f)
5666
5667     @functools.wraps(f)
5668     def wrapper(self, *args, **kwargs):
5669         bound_args = signature.bind(self, *args, **kwargs)
5670         bound_args.apply_defaults()
5671         key = tuple(bound_args.arguments.values())
5672
5673         if not hasattr(self, '__cached_method__cache'):
5674             self.__cached_method__cache = {}
5675         cache = self.__cached_method__cache.setdefault(f.__name__, {})
5676         if key not in cache:
5677             cache[key] = f(self, *args, **kwargs)
5678         return cache[key]
5679     return wrapper
5680
5681
5682 class classproperty:
5683     """property access for class methods"""
5684
5685     def __init__(self, func):
5686         functools.update_wrapper(self, func)
5687         self.func = func
5688
5689     def __get__(self, _, cls):
5690         return self.func(cls)
5691
5692
5693 class Namespace(types.SimpleNamespace):
5694     """Immutable namespace"""
5695
5696     def __iter__(self):
5697         return iter(self.__dict__.values())
5698
5699     @property
5700     def items_(self):
5701         return self.__dict__.items()
5702
5703
5704 MEDIA_EXTENSIONS = Namespace(
5705     common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5706     video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5707     common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5708     audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5709     thumbnails=('jpg', 'png', 'webp'),
5710     storyboards=('mhtml', ),
5711     subtitles=('srt', 'vtt', 'ass', 'lrc'),
5712     manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5713 )
5714 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5715 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5716
5717 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5718
5719
5720 class RetryManager:
5721     """Usage:
5722         for retry in RetryManager(...):
5723             try:
5724                 ...
5725             except SomeException as err:
5726                 retry.error = err
5727                 continue
5728     """
5729     attempt, _error = 0, None
5730
5731     def __init__(self, _retries, _error_callback, **kwargs):
5732         self.retries = _retries or 0
5733         self.error_callback = functools.partial(_error_callback, **kwargs)
5734
5735     def _should_retry(self):
5736         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5737
5738     @property
5739     def error(self):
5740         if self._error is NO_DEFAULT:
5741             return None
5742         return self._error
5743
5744     @error.setter
5745     def error(self, value):
5746         self._error = value
5747
5748     def __iter__(self):
5749         while self._should_retry():
5750             self.error = NO_DEFAULT
5751             self.attempt += 1
5752             yield self
5753             if self.error:
5754                 self.error_callback(self.error, self.attempt, self.retries)
5755
5756     @staticmethod
5757     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5758         """Utility function for reporting retries"""
5759         if count > retries:
5760             if error:
5761                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5762             raise e
5763
5764         if not count:
5765             return warn(e)
5766         elif isinstance(e, ExtractorError):
5767             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5768         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5769
5770         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5771         if delay:
5772             info(f'Sleeping {delay:.2f} seconds ...')
5773             time.sleep(delay)
5774
5775
5776 def make_archive_id(ie, video_id):
5777     ie_key = ie if isinstance(ie, str) else ie.ie_key()
5778     return f'{ie_key.lower()} {video_id}'
5779
5780
5781 def truncate_string(s, left, right=0):
5782     assert left > 3 and right >= 0
5783     if s is None or len(s) <= left + right:
5784         return s
5785     return f'{s[:left-3]}...{s[-right:]}'
5786
5787
5788 # Deprecated
5789 has_certifi = bool(certifi)
5790 has_websockets = bool(websockets)