yt_dlp/utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import contextlib
   9 import datetime
  10 import email.header
  11 import email.utils
  12 import errno
  13 import gzip
  14 import hashlib
  15 import hmac
  16 import html.entities
  17 import html.parser
  18 import http.client
  19 import http.cookiejar
  20 import importlib.util
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import shlex
  34 import socket
  35 import ssl
  36 import struct
  37 import subprocess
  38 import sys
  39 import tempfile
  40 import time
  41 import traceback
  42 import types
  43 import unicodedata
  44 import urllib.error
  45 import urllib.parse
  46 import urllib.request
  47 import xml.etree.ElementTree
  48 import zlib
  49
  50 from .compat import functools  # isort: split
  51 from .compat import (
  52     compat_etree_fromstring,
  53     compat_expanduser,
  54     compat_HTMLParseError,
  55     compat_os_name,
  56     compat_shlex_quote,
  57 )
  58 from .dependencies import brotli, certifi, websockets, xattr
  59 from .socks import ProxyType, sockssocket
  60
  61
  62 def register_socks_protocols():
  63     # "Register" SOCKS protocols
  64     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  65     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  66     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  67         if scheme not in urllib.parse.uses_netloc:
  68             urllib.parse.uses_netloc.append(scheme)
  69
  70
  71 # This is not clearly defined otherwise
  72 compiled_regex_type = type(re.compile(''))
  73
  74
  75 def random_user_agent():
  76     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  77     _CHROME_VERSIONS = (
  78         '90.0.4430.212',
  79         '90.0.4430.24',
  80         '90.0.4430.70',
  81         '90.0.4430.72',
  82         '90.0.4430.85',
  83         '90.0.4430.93',
  84         '91.0.4472.101',
  85         '91.0.4472.106',
  86         '91.0.4472.114',
  87         '91.0.4472.124',
  88         '91.0.4472.164',
  89         '91.0.4472.19',
  90         '91.0.4472.77',
  91         '92.0.4515.107',
  92         '92.0.4515.115',
  93         '92.0.4515.131',
  94         '92.0.4515.159',
  95         '92.0.4515.43',
  96         '93.0.4556.0',
  97         '93.0.4577.15',
  98         '93.0.4577.63',
  99         '93.0.4577.82',
 100         '94.0.4606.41',
 101         '94.0.4606.54',
 102         '94.0.4606.61',
 103         '94.0.4606.71',
 104         '94.0.4606.81',
 105         '94.0.4606.85',
 106         '95.0.4638.17',
 107         '95.0.4638.50',
 108         '95.0.4638.54',
 109         '95.0.4638.69',
 110         '95.0.4638.74',
 111         '96.0.4664.18',
 112         '96.0.4664.45',
 113         '96.0.4664.55',
 114         '96.0.4664.93',
 115         '97.0.4692.20',
 116     )
 117     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 118
 119
# Names of the content encodings treated as supported; 'br' is only
# listed when the optional brotli dependency is truthy (i.e. was imported)
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default header values. The User-Agent is chosen once, when this module
# is imported, and stays the same until the dict is modified
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 132
 133
# Fixed User-Agent strings, keyed by a short browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel meaning "no default was supplied"; compared with `is`,
# so that None can still be passed as a real default value
NO_DEFAULT = object()
# Function that returns its argument unchanged
IDENTITY = lambda x: x

# English month names, in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Timezone abbreviation -> UTC offset (the values look like whole hours - confirm against users)
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
 162
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# zip() pairs the characters positionally with the chained replacements; the
# bracketed list items ('AE', 'OE', 'TH', 'ss', ...) are the multi-letter ones
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 167
# strptime()-style date/time format strings that are unambiguous with
# respect to day/month order
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# DATE_FORMATS followed by the numeric formats that put the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

# DATE_FORMATS followed by the numeric formats that put the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 232
# Captures four groups from a "}('...',N,N,'...'.split('|')" call tail: a quoted
# string, two integers and the '|'-separated string
# (presumably the argument list of packed JavaScript - confirm against users)
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element and captures the
# {...} object inside it as the named group 'json_ld'
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

# Unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
 237
 238
 239 @functools.cache
 240 def preferredencoding():
 241     """Get preferred encoding.
 242
 243     Returns the best encoding scheme for the system, based on
 244     locale.getpreferredencoding() and some further tweaks.
 245     """
 246     try:
 247         pref = locale.getpreferredencoding()
 248         'TEST'.encode(pref)
 249     except Exception:
 250         pref = 'UTF-8'
 251
 252     return pref
 253
 254
 255 def write_json_file(obj, fn):
 256     """ Encode obj as JSON and write it to fn, atomically if possible """
 257
 258     tf = tempfile.NamedTemporaryFile(
 259         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 260         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 261
 262     try:
 263         with tf:
 264             json.dump(obj, tf, ensure_ascii=False)
 265         if sys.platform == 'win32':
 266             # Need to remove existing file on Windows, else os.rename raises
 267             # WindowsError or FileExistsError.
 268             with contextlib.suppress(OSError):
 269                 os.unlink(fn)
 270         with contextlib.suppress(OSError):
 271             mask = os.umask(0)
 272             os.umask(mask)
 273             os.chmod(tf.name, 0o666 & ~mask)
 274         os.rename(tf.name, fn)
 275     except Exception:
 276         with contextlib.suppress(OSError):
 277             os.remove(tf.name)
 278         raise
 279
 280
 281 def find_xpath_attr(node, xpath, key, val=None):
 282     """ Find the xpath xpath[@key=val] """
 283     assert re.match(r'^[a-zA-Z_-]+$', key)
 284     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 285     return node.find(expr)
 286
 287 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 288 # the namespace parameter
 289
 290
 291 def xpath_with_ns(path, ns_map):
 292     components = [c.split(':') for c in path.split('/')]
 293     replaced = []
 294     for c in components:
 295         if len(c) == 1:
 296             replaced.append(c[0])
 297         else:
 298             ns, tag = c
 299             replaced.append('{%s}%s' % (ns_map[ns], tag))
 300     return '/'.join(replaced)
 301
 302
 303 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 304     def _find_xpath(xpath):
 305         return node.find(xpath)
 306
 307     if isinstance(xpath, str):
 308         n = _find_xpath(xpath)
 309     else:
 310         for xp in xpath:
 311             n = _find_xpath(xp)
 312             if n is not None:
 313                 break
 314
 315     if n is None:
 316         if default is not NO_DEFAULT:
 317             return default
 318         elif fatal:
 319             name = xpath if name is None else name
 320             raise ExtractorError('Could not find XML element %s' % name)
 321         else:
 322             return None
 323     return n
 324
 325
 326 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 327     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 328     if n is None or n == default:
 329         return n
 330     if n.text is None:
 331         if default is not NO_DEFAULT:
 332             return default
 333         elif fatal:
 334             name = xpath if name is None else name
 335             raise ExtractorError('Could not find XML element\'s text %s' % name)
 336         else:
 337             return None
 338     return n.text
 339
 340
 341 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 342     n = find_xpath_attr(node, xpath, key)
 343     if n is None:
 344         if default is not NO_DEFAULT:
 345             return default
 346         elif fatal:
 347             name = f'{xpath}[@{key}]' if name is None else name
 348             raise ExtractorError('Could not find XML attribute %s' % name)
 349         else:
 350             return None
 351     return n.attrib[key]
 352
 353
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document

    Delegates to get_element_by_attribute() with the attribute name 'id';
    extra keyword arguments are forwarded to it unchanged.
    """
    return get_element_by_attribute('id', id, html, **kwargs)
 357
 358
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document

    Delegates to get_element_html_by_attribute() with the attribute name 'id';
    extra keyword arguments are forwarded to it unchanged.
    """
    return get_element_html_by_attribute('id', id, html, **kwargs)
 362
 363
 364 def get_element_by_class(class_name, html):
 365     """Return the content of the first tag with the specified class in the passed HTML document"""
 366     retval = get_elements_by_class(class_name, html)
 367     return retval[0] if retval else None
 368
 369
 370 def get_element_html_by_class(class_name, html):
 371     """Return the html of the first tag with the specified class in the passed HTML document"""
 372     retval = get_elements_html_by_class(class_name, html)
 373     return retval[0] if retval else None
 374
 375
 376 def get_element_by_attribute(attribute, value, html, **kwargs):
 377     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 378     return retval[0] if retval else None
 379
 380
 381 def get_element_html_by_attribute(attribute, value, html, **kargs):
 382     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 383     return retval[0] if retval else None
 384
 385
 386 def get_elements_by_class(class_name, html, **kargs):
 387     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 388     return get_elements_by_attribute(
 389         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 390         html, escape_value=False)
 391
 392
 393 def get_elements_html_by_class(class_name, html):
 394     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 395     return get_elements_html_by_attribute(
 396         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 397         html, escape_value=False)
 398
 399
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Takes the same arguments as get_elements_text_and_html_by_attribute() and
    returns a list holding the content (first item) of every pair it yields.
    """
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 403
 404
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document

    Takes the same arguments as get_elements_text_and_html_by_attribute() and
    returns a list holding the whole html (second item) of every pair it yields.
    """
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 408
 409
 410 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 411     """
 412     Return the text (content) and the html (whole) of the tag with the specified
 413     attribute in the passed HTML document
 414     """
 415
 416     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 417
 418     value = re.escape(value) if escape_value else value
 419
 420     partial_element_re = rf'''(?x)
 421         <(?P<tag>[a-zA-Z0-9:._-]+)
 422          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 423          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 424         '''
 425
 426     for m in re.finditer(partial_element_re, html):
 427         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 428
 429         yield (
 430             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 431             whole
 432         )
 433
 434
 435 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 436     """
 437     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 438     closing tag for the first opening tag it has encountered, and can be used
 439     as a context manager
 440     """
 441
 442     class HTMLBreakOnClosingTagException(Exception):
 443         pass
 444
 445     def __init__(self):
 446         self.tagstack = collections.deque()
 447         html.parser.HTMLParser.__init__(self)
 448
 449     def __enter__(self):
 450         return self
 451
 452     def __exit__(self, *_):
 453         self.close()
 454
 455     def close(self):
 456         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 457         # so data remains buffered; we no longer have any interest in it, thus
 458         # override this method to discard it
 459         pass
 460
 461     def handle_starttag(self, tag, _):
 462         self.tagstack.append(tag)
 463
 464     def handle_endtag(self, tag):
 465         if not self.tagstack:
 466             raise compat_HTMLParseError('no tags in the stack')
 467         while self.tagstack:
 468             inner_tag = self.tagstack.pop()
 469             if inner_tag == tag:
 470                 break
 471         else:
 472             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 473         if not self.tagstack:
 474             raise self.HTMLBreakOnClosingTagException()
 475
 476
 477 def get_element_text_and_html_by_tag(tag, html):
 478     """
 479     For the first element with the specified tag in the passed HTML document
 480     return its' content (text) and the whole element (html)
 481     """
 482     def find_or_raise(haystack, needle, exc):
 483         try:
 484             return haystack.index(needle)
 485         except ValueError:
 486             raise exc
 487     closing_tag = f'</{tag}>'
 488     whole_start = find_or_raise(
 489         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 490     content_start = find_or_raise(
 491         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 492     content_start += whole_start + 1
 493     with HTMLBreakOnClosingTagParser() as parser:
 494         parser.feed(html[whole_start:content_start])
 495         if not parser.tagstack or parser.tagstack[0] != tag:
 496             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 497         offset = content_start
 498         while offset < len(html):
 499             next_closing_tag_start = find_or_raise(
 500                 html[offset:], closing_tag,
 501                 compat_HTMLParseError(f'closing {tag} tag not found'))
 502             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 503             try:
 504                 parser.feed(html[offset:offset + next_closing_tag_end])
 505                 offset += next_closing_tag_end
 506             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 507                 return html[content_start:offset + next_closing_tag_start], \
 508                     html[whole_start:offset + next_closing_tag_end]
 509         raise compat_HTMLParseError('unexpected end of html')
 510
 511
 512 class HTMLAttributeParser(html.parser.HTMLParser):
 513     """Trivial HTML parser to gather the attributes for a single element"""
 514
 515     def __init__(self):
 516         self.attrs = {}
 517         html.parser.HTMLParser.__init__(self)
 518
 519     def handle_starttag(self, tag, attrs):
 520         self.attrs = dict(attrs)
 521
 522
 523 class HTMLListAttrsParser(html.parser.HTMLParser):
 524     """HTML parser to gather the attributes for the elements of a list"""
 525
 526     def __init__(self):
 527         html.parser.HTMLParser.__init__(self)
 528         self.items = []
 529         self._level = 0
 530
 531     def handle_starttag(self, tag, attrs):
 532         if tag == 'li' and self._level == 0:
 533             self.items.append(dict(attrs))
 534         self._level += 1
 535
 536     def handle_endtag(self, tag):
 537         self._level -= 1
 538
 539
 540 def extract_attributes(html_element):
 541     """Given a string for an HTML element such as
 542     <el
 543          a="foo" B="bar" c="&98;az" d=boz
 544          empty= noval entity="&amp;"
 545          sq='"' dq="'"
 546     >
 547     Decode and return a dictionary of attributes.
 548     {
 549         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 550         'empty': '', 'noval': None, 'entity': '&',
 551         'sq': '"', 'dq': '\''
 552     }.
 553     """
 554     parser = HTMLAttributeParser()
 555     with contextlib.suppress(compat_HTMLParseError):
 556         parser.feed(html_element)
 557         parser.close()
 558     return parser.attrs
 559
 560
 561 def parse_list(webpage):
 562     """Given a string for an series of HTML <li> elements,
 563     return a dictionary of their attributes"""
 564     parser = HTMLListAttrsParser()
 565     parser.feed(webpage)
 566     parser.close()
 567     return parser.items
 568
 569
 570 def clean_html(html):
 571     """Clean an HTML snippet into a readable string"""
 572
 573     if html is None:  # Convenience for sanitizing descriptions etc.
 574         return html
 575
 576     html = re.sub(r'\s+', ' ', html)
 577     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 578     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 579     # Strip html tags
 580     html = re.sub('<.*?>', '', html)
 581     # Replace html entities
 582     html = unescapeHTML(html)
 583     return html.strip()
 584
 585
 586 class LenientJSONDecoder(json.JSONDecoder):
 587     def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
 588         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 589         super().__init__(*args, **kwargs)
 590
 591     def decode(self, s):
 592         if self.transform_source:
 593             s = self.transform_source(s)
 594         if self.ignore_extra:
 595             return self.raw_decode(s.lstrip())[0]
 596         return super().decode(s)
 597
 598
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output. For any other name at most
    two attempts are made: one with the name as given and, if that fails,
    one with the name passed through sanitize_path().

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer when stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed: fall back to a plain, unlocked open().
                # NOTE(review): relies on LockingUnsupportedError being an
                # OSError subclass; it is defined outside this view - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, and right away on permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing changed nothing, so retrying would fail the same way
            if old_filename == filename:
                raise
 636
 637
 638 def timeconvert(timestr):
 639     """Convert RFC 2822 defined time string into system timestamp"""
 640     timestamp = None
 641     timetuple = email.utils.parsedate_tz(timestr)
 642     if timetuple is not None:
 643         timestamp = email.utils.mktime_tz(timetuple)
 644     return timestamp
 645
 646
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' for empty input, otherwise never empty
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Maps one input character to its replacement. A replacement prefixed
        # with '\0' marks the next character as a substitute, so that the
        # clean-up below can tell substitutes apart from original characters
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and ASCII control characters are dropped entirely
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers themselves; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # This clean-up only runs for an explicitly falsy is_id: the NO_DEFAULT
    # sentinel is a plain object() and therefore truthy
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 699
 700
 701 def sanitize_path(s, force=False):
 702     """Sanitizes and normalizes path on Windows"""
 703     if sys.platform == 'win32':
 704         force = False
 705         drive_or_unc, _ = os.path.splitdrive(s)
 706     elif force:
 707         drive_or_unc = ''
 708     else:
 709         return s
 710
 711     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 712     if drive_or_unc:
 713         norm_path.pop(0)
 714     sanitized_path = [
 715         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 716         for path_part in norm_path]
 717     if drive_or_unc:
 718         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 719     elif force and s and s[0] == os.path.sep:
 720         sanitized_path.insert(0, os.path.sep)
 721     return os.path.join(*sanitized_path)
 722
 723
 724 def sanitize_url(url, *, scheme='http'):
 725     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 726     # the number of unwanted failures due to missing protocol
 727     if url is None:
 728         return
 729     elif url.startswith('//'):
 730         return f'{scheme}:{url}'
 731     # Fix some common typos seen so far
 732     COMMON_TYPOS = (
 733         # https://github.com/ytdl-org/youtube-dl/issues/15649
 734         (r'^httpss://', r'https://'),
 735         # https://bx1.be/lives/direct-tv/
 736         (r'^rmtp([es]?)://', r'rtmp\1://'),
 737     )
 738     for mistake, fixup in COMMON_TYPOS:
 739         if re.match(mistake, url):
 740             return re.sub(mistake, fixup, url)
 741     return url
 742
 743
 744 def extract_basic_auth(url):
 745     parts = urllib.parse.urlsplit(url)
 746     if parts.username is None:
 747         return url, None
 748     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 749         parts.hostname if parts.port is None
 750         else '%s:%d' % (parts.hostname, parts.port))))
 751     auth_payload = base64.b64encode(
 752         ('%s:%s' % (parts.username, parts.password or '')).encode())
 753     return url, f'Basic {auth_payload.decode()}'
 754
 755
 756 def sanitized_Request(url, *args, **kwargs):
 757     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 758     if auth_header is not None:
 759         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 760         headers['Authorization'] = auth_header
 761     return urllib.request.Request(url, *args, **kwargs)
 762
 763
def expand_path(s):
    """Expand $ shell variables and ~

    The user directory is expanded first (compat_expanduser), then the
    environment variables (os.path.expandvars).
    """
    return os.path.expandvars(compat_expanduser(s))
 767
 768
 769 def orderedSet(iterable, *, lazy=False):
 770     """Remove all duplicates from the input iterable"""
 771     def _iter():
 772         seen = []  # Do not use set since the items can be unhashable
 773         for x in iterable:
 774             if x not in seen:
 775                 seen.append(x)
 776                 yield x
 777
 778     return _iter() if lazy else list(_iter())
 779
 780
 781 def _htmlentity_transform(entity_with_semicolon):
 782     """Transforms an HTML entity to a character."""
 783     entity = entity_with_semicolon[:-1]
 784
 785     # Known non-numeric HTML entity
 786     if entity in html.entities.name2codepoint:
 787         return chr(html.entities.name2codepoint[entity])
 788
 789     # TODO: HTML5 allows entities without a semicolon.
 790     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 791     if entity_with_semicolon in html.entities.html5:
 792         return html.entities.html5[entity_with_semicolon]
 793
 794     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 795     if mobj is not None:
 796         numstr = mobj.group(1)
 797         if numstr.startswith('x'):
 798             base = 16
 799             numstr = '0%s' % numstr
 800         else:
 801             base = 10
 802         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 803         with contextlib.suppress(ValueError):
 804             return chr(int(numstr, base))
 805
 806     # Unknown entity in name, return its literal representation
 807     return '&%s;' % entity
 808
 809
 810 def unescapeHTML(s):
 811     if s is None:
 812         return None
 813     assert isinstance(s, str)
 814
 815     return re.sub(
 816         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 817
 818
 819 def escapeHTML(text):
 820     return (
 821         text
 822         .replace('&', '&amp;')
 823         .replace('<', '&lt;')
 824         .replace('>', '&gt;')
 825         .replace('"', '&quot;')
 826         .replace("'", '&#39;')
 827     )
 828
 829
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated wrapper around Popen.communicate_or_kill()

    Emits a deprecation warning and then calls Popen.communicate_or_kill()
    on the process p with the remaining arguments.
    """
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
 834
 835
 836 class Popen(subprocess.Popen):
 837     if sys.platform == 'win32':
 838         _startupinfo = subprocess.STARTUPINFO()
 839         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 840     else:
 841         _startupinfo = None
 842
 843     @staticmethod
 844     def _fix_pyinstaller_ld_path(env):
 845         """Restore LD_LIBRARY_PATH when using PyInstaller
 846             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 847                  https://github.com/yt-dlp/yt-dlp/issues/4573
 848         """
 849         if not hasattr(sys, '_MEIPASS'):
 850             return
 851
 852         def _fix(key):
 853             orig = env.get(f'{key}_ORIG')
 854             if orig is None:
 855                 env.pop(key, None)
 856             else:
 857                 env[key] = orig
 858
 859         _fix('LD_LIBRARY_PATH')  # Linux
 860         _fix('DYLD_LIBRARY_PATH')  # macOS
 861
 862     def __init__(self, *args, env=None, text=False, **kwargs):
 863         if env is None:
 864             env = os.environ.copy()
 865         self._fix_pyinstaller_ld_path(env)
 866
 867         if text is True:
 868             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 869             kwargs.setdefault('encoding', 'utf-8')
 870             kwargs.setdefault('errors', 'replace')
 871         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 872
 873     def communicate_or_kill(self, *args, **kwargs):
 874         try:
 875             return self.communicate(*args, **kwargs)
 876         except BaseException:  # Including KeyboardInterrupt
 877             self.kill(timeout=None)
 878             raise
 879
 880     def kill(self, *, timeout=0):
 881         super().kill()
 882         if timeout != 0:
 883             self.wait(timeout=timeout)
 884
 885     @classmethod
 886     def run(cls, *args, timeout=None, **kwargs):
 887         with cls(*args, **kwargs) as proc:
 888             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 889             return stdout or '', stderr or '', proc.returncode
 890
 891
 892 def get_subprocess_encoding():
 893     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 894         # For subprocess calls, encode with locale encoding
 895         # Refer to http://stackoverflow.com/a/9951851/35070
 896         encoding = preferredencoding()
 897     else:
 898         encoding = sys.getfilesystemencoding()
 899     if encoding is None:
 900         encoding = 'utf-8'
 901     return encoding
 902
 903
def encodeFilename(s, for_subprocess=False):
    """Return ``s`` unchanged after asserting that it is a ``str``.

    ``for_subprocess`` is accepted but not used by this implementation.
    """
    assert isinstance(s, str)
    return s
 907
 908
def decodeFilename(b, for_subprocess=False):
    """Return ``b`` unchanged; ``for_subprocess`` is accepted but not used."""
    return b
 911
 912
 913 def encodeArgument(s):
 914     # Legacy code that uses byte strings
 915     # Uncomment the following line after fixing all post processors
 916     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 917     return s if isinstance(s, str) else s.decode('ascii')
 918
 919
def decodeArgument(b):
    """Return ``b`` unchanged."""
    return b
 922
 923
 924 def decodeOption(optval):
 925     if optval is None:
 926         return optval
 927     if isinstance(optval, bytes):
 928         optval = optval.decode(preferredencoding())
 929
 930     assert isinstance(optval, str)
 931     return optval
 932
 933
 934 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 935
 936
 937 def timetuple_from_msec(msec):
 938     secs, msec = divmod(msec, 1000)
 939     mins, secs = divmod(secs, 60)
 940     hrs, mins = divmod(mins, 60)
 941     return _timetuple(hrs, mins, secs, msec)
 942
 943
 944 def formatSeconds(secs, delim=':', msec=False):
 945     time = timetuple_from_msec(secs * 1000)
 946     if time.hours:
 947         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 948     elif time.minutes:
 949         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 950     else:
 951         ret = '%d' % time.seconds
 952     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 953
 954
 955 def _ssl_load_windows_store_certs(ssl_context, storename):
 956     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 957     try:
 958         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 959                  if encoding == 'x509_asn' and (
 960                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 961     except PermissionError:
 962         return
 963     for cert in certs:
 964         with contextlib.suppress(ssl.SSLError):
 965             ssl_context.load_verify_locations(cadata=cert)
 966
 967
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler backed by an SSLContext configured from ``params``.

    Keys read from ``params``: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'. Extra keyword arguments are forwarded to
    YoutubeDLHTTPSHandler.

    Raises YoutubeDLError if loading the client certificate chain fails with
    ssl.SSLError.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # Hostname checking is configured before verify_mode (set further down);
    # the ssl module rejects CERT_NONE while hostname checking is still enabled
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Use the certifi CA bundle when available, unless the 'no-certifi' compat option is set
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # Fallback: on Windows load the 'CA' and 'ROOT' stores certificate by
                # certificate, then (on every platform) the default verify paths
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    # Optional client-side certificate
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1010
1011
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text.

    ``before`` is the text the message is appended to (trailing whitespace
    is ignored). The message starts with a capital letter when ``before``
    is empty or ends a sentence ('.', '!' or '?'); otherwise it continues
    the sentence in lower case. A non-empty ``before`` is prepended,
    separated by one space.
    """
    from .update import REPOSITORY

    message = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
               'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    starts_new_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_new_sentence:
        message = message[0].title() + message[1:]

    if not prefix:
        return message
    return prefix + ' ' + message
1023
1024
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""

    # Default message; subclasses override it to provide a fixed text
    msg = None

    def __init__(self, msg=None):
        """Store the message and initialise the underlying ``Exception`` with it.

        Precedence: the ``msg`` argument, then the class-level ``msg``, then
        the name of the concrete class.
        """
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1035
1036
# Exception types that are regarded as network failures
network_exceptions = (
    urllib.error.URLError,
    http.client.HTTPException,
    socket.error,
    *([ssl.CertificateError] if hasattr(ssl, 'CertificateError') else []),
)
1041
1042
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """Build the error and its full message.

        msg       the error text; anything that is not a str is converted
                  with str()
        tb        if given, the original traceback (so that it can be printed out)
        expected  if set, this is a normal error message and most likely not a
                  bug in yt-dlp, so no bug-report hint is appended
        cause     the underlying exception, if any; its repr is added to the message
        video_id  if given, prefixed to the message as "<video_id>: "
        ie        if given, prefixed to the message as "[<ie>] "
        """
        # An error raised while a network exception is being handled counts as expected.
        # Exact type match: `in` compares the exception class itself with the
        # tuple members, so subclasses of the listed types are not matched
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Re-wrapping another ExtractorError: keep the innermost exc_info
            self.exc_info = self.exc_info[1].exc_info

        # Join the stringified message: str.join() raises TypeError for
        # non-str items, which would hide the real error behind a failure
        # to construct this exception
        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            self.orig_msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback text of ``tb`` and/or ``cause``.

        The two parts are combined with a newline; None is returned when the
        combined result is empty.
        """
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
1075
1076
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported.

    The offending URL is kept in the ``url`` attribute.
    """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1082
1083
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match.

    Adds no behaviour of its own to ExtractorError; it only provides a
    distinct exception type for this failure.
    """
    pass
1087
1088
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    May be raised when a video cannot be accessed from the user's
    geographic location because the website restricts where it is
    available. Always constructed as an expected error.
    """

    def __init__(self, msg, countries=None, **kwargs):
        """``countries`` is stored as given; other keywords go to ExtractorError.

        Any ``expected`` value supplied by the caller is overridden with True.
        """
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1100
1101
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        """Use a default text for a missing/empty ``msg``; always marked as expected."""
        if not msg:
            msg = 'The channel is not currently live'
        super().__init__(msg, **{**kwargs, 'expected': True})
1108
1109
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may raise this when they are not configured to
    continue on errors. The exception carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """Create the error.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        self.exc_info = exc_info
        super().__init__(msg)
1122
1123
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Fixed message, used by YoutubeDLError.__init__ when no message is passed
    msg = 'Entry not found in info'
1131
1132
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ``filename`` to the standard message when one is given."""
        if filename is not None:
            # Rebinds msg on the instance; the class-level text is untouched
            self.msg = self.msg + f': {filename}'
        super().__init__(self.msg)
1145
1146
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.

    Defines nothing of its own; message handling is inherited from
    YoutubeDLError.
    """
1153
1154
class DownloadCancelled(YoutubeDLError):
    """Exception raised when the download queue should be interrupted.

    Base class of ExistingVideoReached, RejectedVideoReached and
    MaxDownloadsReached.
    """
    # Fixed message, used by YoutubeDLError.__init__ when no message is passed
    msg = 'The download was cancelled'
1158
1159
class ExistingVideoReached(DownloadCancelled):
    """--break-on-existing triggered."""
    # Fixed message, used by YoutubeDLError.__init__ when no message is passed
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1163
1164
class RejectedVideoReached(DownloadCancelled):
    """--break-on-reject triggered."""
    # Fixed message, used by YoutubeDLError.__init__ when no message is passed
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1168
1169
class MaxDownloadsReached(DownloadCancelled):
    """--max-downloads limit has been reached."""
    # Fixed message, used by YoutubeDLError.__init__ when no message is passed
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1173
1174
class ReExtractInfo(YoutubeDLError):
    """Video info needs to be re-extracted.

    The ``expected`` attribute holds the flag passed to the constructor.
    """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1181
1182
class ThrottledDownload(ReExtractInfo):
    """Download speed below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always reports the class-level message with expected=False
        super().__init__(self.msg, expected=False)
1189
1190
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ``err`` to the standard message when one is given."""
        if err is not None:
            # Rebinds msg on the instance; the class-level text is untouched
            self.msg = self.msg + f': {err}'
        super().__init__(self.msg)
1203
1204
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    May be raised by FileDownloader objects when a downloaded file is
    smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Record both sizes (in bytes) and build the message from them."""
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1218
1219
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno ``code``, a message and a derived ``reason``.

    ``reason`` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify by errno first, falling back to the wording of the message
        no_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg
            or 'Disk quota exceeded' in msg)
        if no_space:
            self.reason = 'NO_SPACE'
            return

        too_long = code == errno.E2BIG or 'Argument list too long' in msg
        self.reason = 'VALUE_TOO_LONG' if too_long else 'NOT_SUPPORTED'
1234
1235
class XAttrUnavailableError(YoutubeDLError):
    """Distinct YoutubeDLError subclass with no behaviour of its own."""
    # NOTE(review): presumably signals that extended attributes cannot be used
    # at all - confirm against the raise sites, which are not in this part of the file
    pass
1238
1239
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate ``http_class`` and, if configured, pin it to a source address.

    ``ydl_handler._params`` is consulted for 'source_address'. When it is
    set, the connection gets a replacement ``_create_connection`` (only if
    the instance already has that attribute) which restricts the remote
    addresses to the IP family of the source address, and ``source_address``
    is set to ``(source_address, 0)``.

    ``is_https`` is accepted but not used here. Remaining positional and
    keyword arguments are passed to ``http_class``.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: the `source_address` parameter shadows the outer variable and is
            # indexed below, so it is expected to be a (host, port) tuple here
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source host means IPv4, anything else is treated as IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connection wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and release the socket before trying the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed (or there was none): re-raise the last error
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1285
1286
def handle_youtubedl_headers(headers):
    """Apply the internal "Youtubedl-no-compression" marker to a header mapping.

    Without the marker, ``headers`` itself is returned. With it, a new dict
    is returned that lacks both the marker and any Accept-Encoding header
    (matched case-insensitively); the input is left untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    remaining = {}
    for name, value in headers.items():
        if name.lower() != 'accept-encoding':
            remaining[name] = value
    del remaining['Youtubedl-no-compression']
    return remaining
1295
1296
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped,
    deflated and brotli-compressed responses from web servers. If
    compression is to be avoided in a particular request, the original
    request in the program code only has to include the HTTP header
    "Youtubedl-no-compression", which will be removed before making the
    real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep ``params`` for later lookups; other arguments go to HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open ``req`` over HTTP.

        A 'Ytdl-socks-proxy' request header selects a SOCKS connection class
        and is removed from the request before it is sent.
        """
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded ``data``; empty input is returned as is.

        A raw deflate stream is tried first (negative window bits); on
        zlib.error the data is decompressed again with zlib's defaults.
        """
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli-encoded ``data``; empty input is returned as is."""
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        """Prepare an outgoing request.

        Percent-encodes the URL if needed, adds the configured default
        headers that the request does not already carry, adds an
        Accept-encoding header if missing, and applies the
        "Youtubedl-no-compression" marker.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Headers already present on the request take precedence over the defaults
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode a compressed response body and percent-encode a redirect target.

        For a Content-encoding of 'gzip', 'deflate' or 'br' the body is read
        and decompressed and a new response object wrapping the decoded data
        is returned, with the Content-encoding header removed. For 3xx
        responses the Location header is replaced by its percent-encoded form
        when that differs.
        """
        old_resp = resp
        # Each replacement response below is built with old_resp.headers, so the
        # `del` of Content-encoding is done on that same headers object
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; give up with the
                # original error if none of the attempts decompresses
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
1425
1426
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of ``base_class`` that connects through a SOCKS proxy.

    ``base_class`` must be http.client.HTTPConnection,
    http.client.HTTPSConnection or a subclass of one of them.
    ``socks_proxy`` is the proxy URL. Its scheme selects the protocol:
    'socks5', 'socks4a', or 'socks'/'socks4' for SOCKS4 (case-insensitive).
    The port defaults to 1080; username and password, when present, are
    percent-decoded.

    Raises ValueError if the scheme is not one of the supported SOCKS schemes.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Without this branch `socks_type` stayed unbound and the function
        # failed further down with an unrelated UnboundLocalError
        raise ValueError(f'Unsupported SOCKS proxy scheme: {scheme!r}')

    def unquote_if_non_empty(s):
        # urlparse yields None for a missing username/password; keep that as is
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            """Connect to the target host through the proxy, then start TLS for HTTPS."""
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                # HTTPSConnection always carries an SSL context on Python 3. The
                # former ssl.wrap_socket() fallback performed no verification and
                # that function no longer exists as of Python 3.12
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)

    return SocksConnection
1468
1469
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler with SOCKS proxy support and a customisable connection class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Keep ``params`` and the connection class; the rest goes to HTTPSHandler.

        ``https_conn_class`` defaults to http.client.HTTPSConnection.
        """
        super().__init__(*args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection

    def https_open(self, req):
        """Open ``req`` over HTTPS.

        A 'Ytdl-socks-proxy' request header selects a SOCKS connection class
        and is removed from the request. A TLS handshake failure alert is
        re-raised as YoutubeDLError with a hint to use
        --legacy-server-connect; every other URLError propagates unchanged.
        """
        conn_class = self._https_conn_class

        # Pass on whichever of these attributes the running Python's handler provides
        conn_options = {
            option: getattr(self, attribute)
            for option, attribute in (('context', '_context'), ('check_hostname', '_check_hostname'))
            if hasattr(self, attribute)
        }

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            conn_class = make_socks_conn_class(conn_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **conn_options)
        except urllib.error.URLError as e:
            reason = e.reason
            handshake_failed = (
                isinstance(reason, ssl.SSLError)
                and getattr(reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if handshake_failed:
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1498
1499
def is_path_like(f):
    """Return True if ``f`` is a ``str``, ``bytes`` or ``os.PathLike`` object."""
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    Cookie jar for Netscape-format cookie files that accepts either a path
    or an already opened text file object.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        """``filename`` may be a path-like object, a file object or None."""
        # The base class is given no filename so that it does not try to
        # convert a file object to a path; the attribute is set below instead
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        """Return the cookie-file spelling ('TRUE'/'FALSE') of a truth value."""
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Context manager yielding a text file object for ``file``.

        A path is opened (UTF-8) for reading or writing and closed on exit.
        A file object is yielded as is and left open; for writing, its
        previous contents are discarded first.
        """
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                # truncate() does not move the stream position. Rewind first,
                # otherwise a stream that has already been read or written
                # gets padded up to the old position before the new content
                file.seek(0)
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write one tab-separated line per cookie to ``f``.

        Cookies marked ``discard`` and expired cookies are skipped unless
        the corresponding ``ignore_*`` flag is set.
        """
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117

        ``filename`` defaults to ``self.filename``; ValueError is raised if
        neither is set. Remaining arguments are passed to ``_really_save``.
        """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        # NOTE: this changes the cookies held in memory as well, not just the output
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        ``filename`` defaults to ``self.filename``; ValueError is raised if
        neither is set. Malformed entries are skipped with a warning, except
        that a line which looks like JSON aborts the load with
        http.cookiejar.LoadError.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Returns the line to hand to the base parser, or raises LoadError
            # for an entry with the wrong field count or a non-numeric expiry
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Collect the accepted lines in memory and let the base class parse that
        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    # First non-blank character; the appended space covers blank lines
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See  '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1632
1633
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor whose HTTPS hooks are aliases of its HTTP hooks."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # Pure delegation to the base class; defined here so that
        # https_response below aliases this class's own method
        return super().http_response(request, response)

    # HTTPS traffic is routed through the very same request/response hooks
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1643
1644
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Derived from the HTTPRedirectHandler implementation in CPython [1].

    Compared to the stock handler it additionally:
     - follows HTTP status code 308 Permanent Redirect [2], which is
       used by some sites [3]
     - rewrites the request method on 301/302/303 as browsers do

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Every handled redirect status is processed by the stock 302 implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect response.

        Invoked by the http_error_30x methods once a redirection response
        has been received. Returns a new Request for `newurl` when the
        redirect may be followed, and raises urllib.error.HTTPError when
        the combination of status code and request method must not be
        redirected.
        """
        method = req.get_method()
        safe_method_redirect = code in (301, 302, 303, 307, 308) and method in ('GET', 'HEAD')
        post_redirect = code in (301, 302, 303) and method == 'POST'
        if not (safe_method_redirect or post_redirect):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # RFC 2616 says a 301/302 answer to a POST MUST NOT be followed
        # without user confirmation. Practically every client follows it
        # anyway, and so does this handler.

        # Tolerate URIs with a literal space. http_error_302() already does
        # a more thorough encoding; this is kept for other callers.
        target_url = newurl.replace(' ', '%20')

        # Headers describing the request body are not carried over
        body_headers = ('content-length', 'content-type')
        forwarded_headers = {}
        for name, value in req.headers.items():
            if name.lower() not in body_headers:
                forwarded_headers[name] = value

        # A 303 must be followed with GET or HEAD
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        # Browsers commonly turn a POST into a GET on 301 and 302 as well
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            method = 'GET'

        return urllib.request.Request(
            target_url,
            headers=forwarded_headers,
            origin_req_host=req.origin_req_host,
            unverifiable=True,
            method=method)
1705
1706
def extract_timezone(date_str):
    """Split a timezone designator off the end of a date string.

    Returns a tuple (offset, remaining) where offset is a datetime.timedelta
    and remaining is date_str with the recognized timezone part removed.
    When nothing is recognized the offset is zero and date_str is unchanged.
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if offset_match:
        remaining = date_str[:-len(offset_match.group('tz'))]
        if not offset_match.group('sign'):
            # Bare "Z": UTC, i.e. no offset
            return datetime.timedelta(), remaining
        direction = 1 if offset_match.group('sign') == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(offset_match.group('hours')),
            minutes=direction * int(offset_match.group('minutes')))
        return offset, remaining

    # No numeric offset: try a timezone abbreviation following a clock time
    name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    offset_hours = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
    if offset_hours is not None:
        # Only strip abbreviations that are actually known
        date_str = date_str[:-len(name_match.group('tz'))]
    return datetime.timedelta(hours=offset_hours or 0), date_str
1735
1736
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter    Separator between the date and the time part
    @param timezone     datetime.timedelta UTC offset of date_str; when None
                        it is extracted from date_str itself
    Returns None if date_str is None or does not match the expected layout.
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    cleaned = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        local_time = datetime.datetime.strptime(cleaned, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((local_time - timezone).timetuple())
    except ValueError:
        return None
1752
1753
def date_formats(day_first=True):
    """Return the strptime format collection for day-first or month-first dates"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1756
1757
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces
    text = date_str.replace(',', ' ')
    # Drop AM/PM together with a directly following timezone word
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)
    # Only the date text is needed here; the offset itself is ignored
    text = extract_timezone(text)[1]

    result = None
    # Every format is tried; when several match, the last one wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(text, fmt).strftime('%Y%m%d')
        except ValueError:
            pass

    if result is None:
        # Fall back to the RFC 2822 style parser
        parsed = email.utils.parsedate_tz(text)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else str(result)
1780
1781
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a loosely formatted date string

    Weekday names, commas and pipes are dropped, a trailing timezone is
    honoured, and an AM/PM marker is applied to the parsed hour.
    Returns None if date_str is None or cannot be parsed.

    @param day_first    Whether ambiguous dates are tried as day-first
    """
    if date_str is None:
        return None

    # Strip separators and weekday names (including Sunday)
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    is_pm = re.search(r'(?i)PM', date_str) is not None
    is_am = not is_pm and re.search(r'(?i)(?<![a-z])AM(?![a-z])', date_str) is not None

    def meridiem_hours(hour):
        """Hours to add to a clock hour that was parsed as 24-hour time"""
        if is_pm:
            # 12:xx PM is already noon; adding 12h would roll into the next day
            return 0 if hour == 12 else 12
        if is_am and hour == 12:
            # 12:xx AM is just after midnight
            return -12
        return 0

    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            parsed = datetime.datetime.strptime(date_str, expression)
            dt = parsed - timezone + datetime.timedelta(hours=meridiem_hours(parsed.hour))
            return calendar.timegm(dt.timetuple())

    # Fall back to the RFC 2822 style parser; index 3 of its tuple is the hour
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + meridiem_hours(timetuple[3]) * 3600 - timezone.total_seconds()
1813
1814
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext"""
    if url is None or '.' not in url:
        return default_ext

    # Text after the last dot of the part preceding the query string
    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1826
1827
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" """
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1830
1831
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Build a datetime object out of a string.
    Accepted syntax:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return now
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if relative is None:
        # Plain DATE without any offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset may itself be relative, hence the recursion
    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    if unit in ('month', 'year'):
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        # Month/year offsets are rounded to whole days in auto mode
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1872
1873
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Build a date object out of a string by means of datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    moment = datetime_from_str(date_str, precision='microsecond', format=format)
    return moment.date()
1884
1885
def datetime_add_months(dt, months):
    """Shift a datetime object by a (possibly negative) number of months.

    The day is clamped to the last day of the resulting month.
    """
    year_shift, zero_based_month = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_shift
    target_month = zero_based_month + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1893
1894
def datetime_round(dt, precision='day'):
    """
    Round the time of a datetime object to the given precision
    (microsecond|second|minute|hour|day); half a unit rounds up
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}
    epoch_seconds = calendar.timegm(dt.timetuple())
    step = seconds_per_unit[precision]
    # Nearest multiple of `step`
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1911
1912
def hyphenate_date(date_str):
    """
    Turn a 'YYYYMMDD' date into 'YYYY-MM-DD'; any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1921
1922
class DateRange:
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str(strict=True); None leaves that side unbounded

        Raises ValueError if start is after end.
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range

        Accepts a datetime.date, a datetime.datetime (its date part is used)
        or a string understood by date_from_str.
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date, but the two cannot be ordered
            # against each other, so reduce it to its calendar date first
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1956
1957
def platform_name():
    """ Deprecated: returns platform.platform() after emitting a deprecation warning """
    notice = f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead'
    deprecation_warning(notice)
    return platform.platform()
1962
1963
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime and the platform (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_parts = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_parts = []

    python_version = platform.python_version()
    bits = platform.architecture()[0]
    os_description = platform.platform()
    libc_description = format_field(join_nonempty(*libc_parts, delim=' '), None, '(%s)')
    return 'Python %s (%s %s) - %s %s' % (
        python_version, implementation, bits, os_description, libc_description)
1980
1981
@functools.cache
def get_windows_version():
    ''' Return the Windows version as produced by version_tuple, or () when not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1989
1990
def write_string(s, out=None, encoding=None):
    """Write the str `s` to `out` (default: sys.stderr) and flush it

    Binary streams, and text streams exposing an underlying `buffer`,
    receive encoded bytes (unencodable characters are dropped); any other
    stream receives the string as is.
    """
    assert isinstance(s, str)
    stream = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        # Put a space in front of every run of line breaks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(stream, 'mode', ''):
        # Stream opened in binary mode: encode ourselves
        target = stream
        enc = encoding or preferredencoding()
    elif hasattr(stream, 'buffer'):
        # Text wrapper: bypass it and write bytes to the underlying buffer
        target = stream.buffer
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
    else:
        target, enc = stream, None

    target.write(s.encode(enc, 'ignore') if enc else s)
    stream.flush()
2007
2008
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation

    When running as CLI (_IN_CLI is true) every distinct message is reported
    only once, through `printer` if given and through write_string otherwise;
    **kwargs are forwarded to whichever is used. In any other case a
    DeprecationWarning is issued via warnings.warn.
    """
    from . import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return
    already_reported.add(msg)

    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that have already been reported in CLI mode
deprecation_warning._cache = set()
2024
2025
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or a str) into a list of integer values"""
    if not bs:
        return []
    # Indexing bytes yields ints directly; characters of a str need ord()
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
2033
2034
def intlist_to_bytes(xs):
    """Pack a sequence of integers (each 0-255) into a bytes object"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
2039
2040
class LockingUnsupportedError(OSError):
    """Raised by the fallback lock helpers when no file locking is available"""

    # Fixed message passed to OSError on construction
    msg = 'File locking is not supported'

    def __init__(self):
        # Takes no arguments: the message is always `msg`
        super().__init__(self.msg)
2046
2047
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl elsewhere, and stubs
# raising LockingUnsupportedError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range used for both locking and unlocking, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object `f`; raises BlockingIOError on failure"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file on `f`; raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file object `f` with flock(), falling back to lockf()"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else; a fallback would not help
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on `f`, using the same flock()/lockf() fallback"""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            """No locking primitive available: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """No locking primitive available: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()
2133
2134
class locked_file:
    """File wrapper that holds a lock on the file while used as a context manager

    The file is opened on construction and locked on entering the context
    (or calling open()); leaving it (or calling close()) unlocks and closes
    the file. Unknown attributes are forwarded to the wrapped file object.
    """

    # Whether this wrapper currently holds the lock
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(flag in mode for flag in 'wax+')
        readable = any(flag in mode for flag in 'r+')
        if not writable:
            access = os.O_RDONLY
        elif readable:
            access = os.O_RDWR
        else:
            access = os.O_WRONLY

        open_flags = 0
        for flag in (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            access,
        ):
            open_flags |= flag

        descriptor = os.open(filename, open_flags, 0o666)
        self.f = os.fdopen(descriptor, mode, encoding=encoding)

    def __enter__(self):
        # Readers take a shared lock, every other mode an exclusive one
        try:
            _lock_file(self.f, 'r' not in self.mode, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise

        if 'w' not in self.mode:
            return self

        # Truncation is postponed until the lock is held
        try:
            self.f.truncate()
        except OSError as err:
            tolerated = (
                errno.ESPIPE,  # Illegal seek - expected for FIFO
                errno.EINVAL,  # Invalid argument - expected for /dev/null
            )
            if err.errno not in tolerated:
                raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file itself stays open"""
        if self.locked:
            try:
                _unlock_file(self.f)
            finally:
                self.locked = False

    def __exit__(self, *_):
        # The file is closed even when unlocking fails
        try:
            self.unlock()
        finally:
            self.f.close()

    # Explicit alternatives to the context manager protocol
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Anything not defined here is served by the wrapped file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2198
2199
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none (cached)"""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
2204
2205
def shell_quote(args):
    """Quote every argument for the shell and join them with single spaces"""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
2215
2216
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into the URL fragment. Data already smuggled
    into `url` is kept and wins over `data` for keys present in both.
    The `data` mapping passed by the caller is left unmodified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict rather than updating the caller's object in place
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2225
2226
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data)

    URLs carrying no smuggled data are returned unchanged together with `default`.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(payload)
2234
2235
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param fmt      %-format taking the scaled number and the suffix
    @param factor   Ratio between consecutive suffixes; 1024 selects
                    the binary suffixes (Ki, Mi, ...)
    Returns None if num cannot be converted to a float or is negative.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    # Clamp to [0, len(POSSIBLE_SUFFIXES)]: values below 1 yield a negative
    # logarithm, which must not be used to index the suffix list from its end
    exponent = 0 if num == 0 else min(max(int(math.log(num, factor)), 0), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2248
2249
def format_bytes(bytes):
    """Format a byte count with two decimals and a binary suffix, or 'N/A' if that fails"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2252
2253
def lookup_unit_table(unit_table, s):
    """Parse "<number> <unit>" at the start of `s` and scale it by unit_table[unit]

    The number may use ',' or '.' as decimal separator.
    Returns the scaled value as int, or None when `s` does not match.
    """
    unit_alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % unit_alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
2263
2264
def parse_filesize(s):
    """Parse a human readable file size such as "1.5 MiB" into a number of bytes

    Returns None if `s` is None or no known unit is found.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    # (symbol, decimal name, binary name), ordered by increasing magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, start=1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        unit_table.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{lower}B': binary,
            f'{symbol}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2334
2335
def parse_count(s):
    """Parse a count such as "1,234", "1.5M" or "Views 10k" into an int

    Returns None if `s` is None or no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    # Digits and separators only
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # A number followed by unrelated text
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', text)
    return str_to_int(leading_number.group(1)) if leading_number else None
2363
2364
def parse_resolution(s, *, lenient=False):
    """Extract video dimensions from a string

    Recognizes "<width>x<height>", "<height>p"/"<height>i" and "4k"/"8k".
    Returns a dict with 'width' and/or 'height', or {} if nothing is found.

    @param lenient  Also accept "<width>x<height>" directly adjacent to
                    other letters or digits
    """
    if s is None:
        return {}

    if lenient:
        dimensions_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        dimensions_re = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    found = re.search(dimensions_re, s)
    if found:
        return {'width': int(found.group('w')), 'height': int(found.group('h'))}

    found = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if found:
        return {'height': int(found.group(1))}

    found = re.search(r'\b([48])[kK]\b', s)
    if found:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(found.group(1)) * 540}

    return {}
2388
2389
def parse_bitrate(s):
    """Return the integer in front of "kbps" in `s`, or None if absent or `s` is not a str"""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found:
            return int(found.group(1))
    return None
2396
2397
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month given its full name, or None if unknown

    Names are looked up in MONTH_NAMES[lang]; the English names are used
    for languages missing from MONTH_NAMES.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2407
2408
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month given the first three letters
        of its English name, or None if unknown """
    try:
        abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2417
2418
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not already start an XML entity as '&amp;'"""
    # '&' not followed by a predefined entity name or a numeric character reference
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2425
2426
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl()

    Does nothing when ctypes or 'libc.so.6' is unavailable, or when the
    loaded libc has no prctl.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except (OSError, TypeError):
        # libc.so.6 could not be loaded: nothing to do on this platform
        return

    # Initializing the buffer from the bytes makes it one byte longer than the
    # title, so the C string handed to prctl() is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2452
2453
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned as is if it is None or lacks the prefix"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2456
2457
def remove_end(s, end):
    """Return `s` without the suffix `end`

    `s` is returned unchanged if it is None, if `end` is empty or if `s`
    does not end with `end`.
    """
    # An empty `end` must be excluded: s[:-0] would be the empty string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2460
2461
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`, if present"""
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
2469
2470
def get_domain(url):
    """
    Return the network location of `url` without a leading "www.", or None if empty.

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    host = urllib.parse.urlparse(url).netloc
    return remove_start(host, 'www.') or None
2477
2478
def url_basename(url):
    """Return the last segment of the URL's path (query and fragment excluded)"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2482
2483
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query/fragment

    Raises AttributeError if url does not match that shape.
    """
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
2486
2487
def urljoin(base, path):
    """Join path onto base, returning None for unusable input

    bytes arguments are decoded first. A path that already carries a scheme
    or is protocol-relative is returned unchanged; otherwise base must be an
    http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2501
2502
class HEADRequest(urllib.request.Request):
    """urllib Request whose get_method() always reports 'HEAD'"""

    def get_method(self):
        return 'HEAD'
2506
2507
class PUTRequest(urllib.request.Request):
    """urllib Request whose get_method() always reports 'PUT'"""

    def get_method(self):
        return 'PUT'
2511
2512
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled as `int(v) * invscale // scale`, or return default

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None).
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        scaled_up = int(v) * invscale
        return scaled_up // scale
    except (ValueError, TypeError, OverflowError):
        return default
2520
2521
def str_or_none(v, default=None):
    """Return str(v), or default when v is None"""
    if v is None:
        return default
    return str(v)
2524
2525
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ints are returned unchanged; strings have ',', '.' and '+' removed
    before conversion; anything else yields None.
    """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits_only = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits_only)
2533
2534
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, scaled as `float(v) * invscale / scale`, or return default"""
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2542
2543
def bool_or_none(v, default=None):
    """Return v only if it is an actual bool, else default"""
    if isinstance(v, bool):
        return v
    return default
2546
2547
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, else default"""
    if not isinstance(v, str):
        return default
    return v.strip()
2550
2551
def url_or_none(url):
    """Return the stripped url if it uses a recognised scheme or is protocol-relative, else None"""
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2557
2558
def request_to_url(req):
    """Return the full URL of a urllib Request; any other value is passed through unchanged"""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2564
2565
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with strftime, returning default on any failure

    @param timestamp    Unix timestamp (int/float, interpreted as UTC) or a
                        'YYYYMMDD' string. Any other type yields default
    @param date_format  strftime format. '%s' is expanded manually since
                        it is not supported on every platform
    @param default      Value returned when the input cannot be formatted
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Build an aware UTC datetime by adding to the epoch:
            #  - timestamp() on a naive datetime is interpreted in local time,
            #    which made the '%s' expansion below depend on the timezone
            #  - (utc)fromtimestamp rejects negative values on some platforms
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        if datetime_object is None:
            return default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # OverflowError/OSError: timestamp outside the representable range (incl. inf)
        return default
2578
2579
def parse_duration(s):
    """Parse a textual duration and return it in seconds as a float, or None

    Three notations are tried in order: clock style ([[DD:]HH:]MM:SS[.ms]),
    unit-suffixed parts ("3h11m53s", "PT1H30M", "1 hour 3 minutes") and a
    single fractional amount of hours or minutes ("2.5 hours", "87 Min.").
    Non-string or blank input gives None.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    FIELDS = ('days', 'hours', 'mins', 'secs', 'ms')
    CLOCK_RE = r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        '''
    UNITS_RE = r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$'''
    FRACTIONAL_RE = r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$'

    found = dict.fromkeys(FIELDS)
    mobj = re.match(CLOCK_RE, s) or re.match(UNITS_RE, s)
    if mobj:
        found.update(zip(FIELDS, mobj.group(*FIELDS)))
    else:
        mobj = re.match(FRACTIONAL_RE, s)
        if not mobj:
            return None
        found['hours'], found['mins'] = mobj.groups()

    if found['ms']:
        # Clock notation may separate the fraction with ':' instead of '.'
        found['ms'] = found['ms'].replace(':', '.')

    weights = (('days', 86400), ('hours', 3600), ('mins', 60), ('secs', 1), ('ms', 1))
    return sum(float(found[name] or 0) * mult for name, mult in weights)
2634
2635
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the file's real extension

    If expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2642
2643
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's real extension with ext

    If expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return f'{stem}.{ext}'
2649
2650
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if running the binary fails with OSError """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2659
2660
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run `exe args` and return its combined stdout/stderr text, or False on OSError

    If to_screen is given, it is called with a message describing the command.
    """
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        command = [encodeArgument(exe)] + args
        output, _, _ = Popen.run(
            command, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return output
2673
2674
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output

    Returns group 1 of the first match of version_re (default: the token
    following the word "version"), or `unrecognized` if nothing matches.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
2684
2685
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or produced no output) """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2692
2693
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the bounds and step may be floats. With a single
    argument it counts from 0 up to that value. A zero step yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    else:
        direction = 1 if step > 0 else -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2702
2703
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in
    an internal cache, so each item is evaluated at most once."""

    class IndexError(IndexError):
        """IndexError subclass raised by LazyList.__getitem__"""
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache lets another LazyList share already-evaluated items (see __reversed__/__copy__)
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # First replay what is already cached, then continue pulling new items
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Move every remaining item into the cache and return the cache (in original order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable

        Returns a list of all items, in reverse order if this LazyList is reversed"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -x - 1, i.e. index x counted from the other end; None is kept as None
        return None if x is None else ~x

    def __getitem__(self, idx):
        # For a reversed list, translate the request into indices of the (forward) cache
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed so that the highest requested index is cached
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only the first item in iteration order is evaluated to test for emptiness
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length can only be known after evaluating everything
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object is given the same underlying iterator and the same cache list
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The copy is given the same underlying iterator and the same cache list
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2791
2792
class PagedList:
    """Base class for lists whose entries are fetched page by page

    pagefunc(pagenum) must return an iterable with the entries of that page.
    Subclasses implement _getslice(start, end).
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page `pagenum` as a list, fetching it if not cached"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                # Past the last known page; nothing to fetch
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2831
2832
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            page_limit = page_first + self._pagesize
            if start >= page_limit:
                continue

            # Offsets of the requested range inside this page
            startv = 0
            if page_first <= start < page_limit:
                startv = start % self._pagesize
            endv = None
            if end is not None and page_first <= end <= page_limit:
                endv = ((end - 1) % self._pagesize) + 1

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched before re-raising
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_is_short = len(page_results) + startv < self._pagesize
            if page_is_short:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
2872
2873
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Caching is always enabled for this variant
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the first page, and entries still wanted overall
        to_skip = start - first_page * self._pagesize
        remaining = None if end is None else end - start
        for pagenum in range(first_page, last_page):
            page_results = self.getpage(pagenum)
            if to_skip:
                page_results = page_results[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    yield from page_results[:remaining]
                    break
                remaining -= len(page_results)
            yield from page_results
2898
2899
class PlaylistEntries:
    """Access to the entries of a playlist info dict using 1-based playlist indices

    Indexing (see __getitem__) yields (playlist_index, entry) tuples.
    """

    # Placeholder stored at positions for which no entry is available
    MissingEntry = object()
    # Set once it is known that all entries have been evaluated
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl          Object providing `params`, `report_warning`, `_match_entry`
                            and (through its type) `_handle_extraction_exceptions`
        @param info_dict    Dict whose 'entries' and 'requested_entries' keys are read
        Raises EntryNotInPlaylist if info_dict has no entries
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each given entry at its (1-based) requested position;
            # every other position holds MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment: "START", or "[START](:|-)[END][:STEP]" where END may be "inf"/"infinite"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int (single item) or a slice (range) for each comma-separated segment

        Raises ValueError for empty segments, invalid segments or a zero step
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end is converted with float_or_none so that "inf" is accepted
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the
        'playlist_items', 'playliststart' and 'playlistend' params of ydl"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop producing any further items
                    return

    def get_full_count(self):
        """Return the total number of entries when it can be determined here, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count equals the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function mapping a 0-based index to its entry

        The returned function raises self.IndexError when the index is out of range
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Evaluating a lazy entry goes through ydl's _handle_extraction_exceptions wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Yield (playlist_index, entry) tuples for a 1-based index or slice

        Unlike list slices, the stop of the slice is inclusive
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Shift the bound by one in the direction of iteration to make it inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards nothing more can follow; going backwards,
                # lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts the entries by iterating over all of them
        return len(tuple(self[:]))

    class IndexError(IndexError):
        """IndexError subclass raised by the entry getter for out-of-range indices"""
        pass
3032
3033
def uppercase_escape(s):
    r"""Decode every \UXXXXXXXX (8 hex digits) escape sequence found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
3040
3041
def lowercase_escape(s):
    r"""Decode every \uXXXX (4 hex digits) escape sequence found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
3048
3049
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    Reserved characters and existing percent signs are left as they are.
    """
    return urllib.parse.quote(s, safe=b"%/;:@&=+$,!~*'()?#[]")
3053
3054
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host part is IDNA-encoded; path, params, query and fragment are
    percent-escaped with escape_rfc3986.
    """
    parsed = urllib.parse.urlparse(url)
    ascii_netloc = parsed.netloc.encode('idna').decode('ascii')
    escaped = {
        field: escape_rfc3986(getattr(parsed, field))
        for field in ('path', 'params', 'query', 'fragment')
    }
    return parsed._replace(netloc=ascii_netloc, **escaped).geturl()
3065
3066
def parse_qs(url):
    """Return the query string of url parsed into a dict of value lists"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
3069
3070
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object (one per line) and close it

    Blank lines and lines starting with '#', ';' or ']' are skipped, and a
    trailing comment introduced by whitespace followed by '#' is removed.
    """
    def _clean(line):
        if not isinstance(line, str):
            line = line.decode('utf-8', 'replace')
        # Drop a byte order mark, whether it was mis-decoded or decoded properly
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if line.startswith(bom):
                line = line[len(bom):]
        line = line.lstrip()
        if not line or line.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        url_part = re.split(r'\s#', line, maxsplit=1)[0]
        return url_part.rstrip()

    with contextlib.closing(batch_fd) as fd:
        urls = []
        for line in fd:
            url = _clean(line)
            if url:
                urls.append(url)
        return urls
3088
3089
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urllib.parse.urlencode and return ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
3092
3093
def update_url_query(url, query):
    """Return url with the parameters from the `query` dict added or replaced

    url is returned unchanged when query is empty.
    """
    if not query:
        return url
    parts = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(parts.query)
    params.update(query)
    new_query = urllib.parse.urlencode(params, True)
    return urllib.parse.urlunparse(parts._replace(query=new_query))
3102
3103
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Build a new request from req with url, data, headers and/or query overridden

    The HEAD/PUT method of req is kept by using the matching request class,
    and its `timeout` attribute is copied over when present.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    req_type = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, urllib.request.Request)

    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3122
3123
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary

    Returns (body_bytes, content_type). Raises ValueError if the boundary
    occurs inside any of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    chunks = []
    for k, v in data.items():
        if isinstance(k, str):
            k = k.encode()
        if isinstance(v, str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks += [delimiter, b'\r\n', content]

    chunks += [delimiter, b'--\r\n']
    return b''.join(chunks), content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded_body, content_type). A ValueError is raised if
    a specified boundary overlaps with the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is used as is; a clash is the caller's error
        return _multipart_encode_impl(data, boundary)

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
3174
3175
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value of d among the given key(s), else default

    None values are always skipped; other falsy values are skipped unless
    skip_false_values is False.
    """
    for key in variadic(key_or_keys):
        val = d.get(key)
        if val is None:
            continue
        if val or not skip_false_values:
            return val
    return default
3181
3182
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function with args/kwargs and return the first acceptable result

    A function that raises AttributeError, KeyError, TypeError, IndexError
    or ZeroDivisionError is skipped, as is a result that is not an instance
    of expected_type (when given). Returns None if nothing qualifies.
    """
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(val, expected_type):
            return val
    return None
3192
3193
def try_get(src, getter, expected_type=None):
    """Apply the getter function(s) to src via try_call and return the first acceptable result"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3196
3197
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is true

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3200
3201
def merge_dicts(*dicts):
    """Merge dicts, giving priority to earlier ones

    None values are ignored, and an empty string already stored may be
    replaced by a string from a later dict.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is not None and k not in merged:
                merged[k] = v
            elif isinstance(v, str) and merged[k] == '':
                merged[k] = v
    return merged
3210
3211
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a str, decoding it with the given encoding if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3214
3215
# Age limit for each (upper-case) US rating label; looked up by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit for each TV rating label. parse_age_limit matches these
# with '-', '_' or nothing between "TV" and the suffix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3233
3234
def parse_age_limit(s):
    """Convert an age rating into an integer age limit, or None if not recognised

    Accepts ints from 0 to 21, strings like "18" or "18+", the labels of
    US_RATINGS and those of TV_PARENTAL_GUIDELINES (case-insensitive).
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3251
3252
def strip_jsonp(code):
    """Remove a JSONP wrapper such as `callback(...);` and return the payload

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
3261
3262
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/array literal into a JSON string

    @param code     The JavaScript source
    @param vars     A dict of var, val pairs to substitute for identifiers
    @param strict   Raise ValueError on unknown identifiers instead of
                    quoting them, and do not unwrap `new Date("...")`
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )
    # Rewrites applied to escape sequences and bare double quotes inside string literals
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_string_token(mobj):
        token = mobj.group(0)
        return STRING_FIXES.get(token, token)

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v in ('undefined', 'void 0'):
            return 'null'
        if v == ',' or v.startswith(('/*', '//', '!')):
            # Comments, trailing commas and negation operators are dropped
            return ""

        if v[0] in ("'", '"'):
            return '"%s"' % re.sub(r'(?s)\\.|"', fix_string_token, v[1:-1])

        # Hexadecimal/octal integers, possibly used as an object key
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]
        if strict:
            raise ValueError(f'Unknown value: {v}')
        return '"%s"' % v

    def create_map(mobj):
        pairs_json = js_to_json(mobj.group(1) or '[]', vars=vars)
        return json.dumps(dict(json.loads(pairs_json)))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    token_re = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)
    return re.sub(token_re, fix_kv, code)
3318
3319
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if it is not listed """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
3328
3329
# NOTE(review): presumably the names of the stages at which a postprocessor
# can be run - confirm against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output filename templates in %-style mapping syntax, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# NOTE(review): the values look like the default file name suffix for each
# template type (None where there is none) - confirm against the callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template for a verbose regex, not a usable pattern by itself:
# the `{0}` placeholder stands for the pattern of the mapping key and `{1}`
# for the pattern of the conversion type (presumably filled in with str.format)
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# NOTE(review): looks like the conversion type characters (the `type` field
# of the syntax above) that are recognised - confirm against the callers
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3369
3370
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       The string to shorten; None is passed through unchanged
    @param length  Maximum length of the returned string
    @returns       s itself if it fits, otherwise a truncated copy that ends
                   with "..." and is at most `length` characters long
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # No room for any text before the ellipses. Slicing with a negative
        # index here would keep almost the whole string and exceed the limit
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
3379
3380
def version_tuple(v):
    """Split a version string on "." and "-" and return the parts as a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
3383
3384
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`.

    When `version` is empty or either version string cannot be parsed,
    the answer is "outdated" only if `assume_new` is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, threshold = version_tuple(version), version_tuple(limit)
    except ValueError:
        # Non-numeric component in one of the version strings
        return fallback
    return current < threshold
3392
3393
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported at call time rather than at module load
    # NOTE(review): presumably to avoid a circular import - confirm
    from .update import is_non_updateable

    updateable = not is_non_updateable()
    return updateable
3400
3401
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Every argument is shell-quoted and the results are joined with spaces.
    """
    quoted_args = map(compat_shlex_quote, args)
    return ' '.join(quoted_args)
3405
3406
def error_to_compat_str(err):
    """Return the string form of an error (a thin wrapper around str())."""
    message = str(err)
    return message
3409
3410
def error_to_str(err):
    """Format an error as "<exception class name>: <message>"."""
    return '{}: {}'.format(type(err).__name__, err)
3413
3414
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    @param mt  A MIME type, optionally followed by ";"-separated parameters
               (e.g. a Content-Type header value), or None
    @returns   None if mt is None. Otherwise the extension found by, in order:
               the full type, the subtype, the "+suffix" of the subtype;
               falling back to the subtype itself with "+" replaced by "."

    Matching is case-insensitive, as type and subtype names are
    case-insensitive per RFC 2045; the result is always lower-case.
    """
    if mt is None:
        return None

    # Parameters such as "; charset=..." play no role in the lookup.
    # Normalize the case once so that every table below is hit consistently
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    # Structured syntax suffix, e.g. "json" in "vnd.api+json"
    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3477
3478
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a filename/URL.

    Returns None for an empty argument or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # A value without a dot is taken to be a bare extension;
    # give it a dummy filename so that mimetypes can work with it
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(target)
    return mime_type
3485
3486
def parse_codecs(codecs_str):
    """Sort the codecs of a comma-separated codecs string into video/audio/subtitle.

    @param codecs_str  Codecs string, e.g. "avc1.64001f, mp4a.40.2"
    @returns           {} for empty input. If any known codec was found, a dict
                       with "vcodec" and "acodec" ("none" when absent),
                       "dynamic_range", and "scodec" when a subtitle codec is
                       present. If nothing was recognised but exactly two
                       codecs were given, they are taken as video and audio.

    Only the first codec of each kind is kept. A warning is written for
    every unrecognised codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    raw_codecs = codecs_str.strip().strip(',').split(',')
    split_codecs = [codec for codec in map(str.strip, raw_codecs) if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros in front of a digit are dropped before matching ("vp09.02" -> "vp9.2")
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif (family == 'av1' and traverse_obj(parts, 3) == '10') or parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        # Nothing recognised: assume the conventional "video, audio" order
        video, audio = split_codecs
        return {
            'vcodec': video,
            'acodec': audio,
        }
    return {}
3527
3528
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension for merging the given video and audio streams.

    @param vcodecs, vexts  Codecs and extensions of the video streams (same length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (same length)
    @param preferences     Optional sequence of extensions to choose from, in order
    @returns               The chosen extension. "mkv" is the fallback unless
                           preferences are given without "mkv", in which case
                           the last preference is returned
    """
    assert len(vcodecs) == len(vexts)
    assert len(acodecs) == len(aexts)

    mkv_allowed = not preferences or 'mkv' in preferences

    # More than one stream of a kind
    if mkv_allowed and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
            'h264', 'aacl',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def first_codec_family(codecs):
        # Name of the first codec up to the first ".", with every "0" removed
        return codecs[0].split('.')[0].replace('0', '')

    vcodec = try_get(vcodecs, getter=first_codec_family)
    acodec = try_get(acodecs, getter=first_codec_family)

    # First try to decide from the codecs...
    for ext in preferences or COMPATIBLE_CODECS.keys():
        if ext == 'mkv':
            return ext
        if COMPATIBLE_CODECS.get(ext, set()).issuperset((vcodec, acodec)):
            return ext

    # ...then from the extensions of the inputs
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm'},
    )
    for ext in preferences or vexts:
        if ext == 'mkv':
            return ext
        current_exts = {ext, *vexts, *aexts}
        if current_exts == {ext}:
            return ext
        if any(family.issuperset(current_exts) for family in COMPATIBLE_EXTS):
            return ext

    if mkv_allowed:
        return 'mkv'
    return preferences[-1]
3567
3568
def urlhandle_detect_ext(url_handle):
    """Detect a file extension from the headers of a response.

    The filename of an attachment Content-Disposition header takes
    precedence; otherwise the Content-Type header is mapped to an extension.
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        ext = mobj and determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3581
3582
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI carrying the bytes `data` with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3585
3586
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    @param content_limit  Age limit of the content; None if it is available for everyone
    @param age_limit      Age limit set for the viewer; None if no limit is set
    """
    if age_limit is None or content_limit is None:
        # Either no limit is set or the content is available for everyone
        return False
    return age_limit < content_limit
3595
3596
# List of known byte-order-marks (BOM)
# Each entry is (BOM bytes, name of the encoding it indicates).
# The 4-byte UTF-32 marks are listed before the UTF-16 ones because the
# UTF-32-LE mark starts with the UTF-16-LE mark; anything that tests the
# entries in order must see the longer mark first
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3605
3606
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the text, after any leading
    byte-order-marks and whitespace, starts with "<"; otherwise None.
    """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip the mark (even if repeated) and decode with the encoding it indicates
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3616
3617
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict.

    An explicit "protocol" entry wins. Otherwise the protocol is derived
    from the "url" entry: from a known streaming prefix, from the
    extension (m3u8/f4m manifests), or finally from the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3638
3639
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row  Values of the header row
    @param data        The data rows
    @param delim       If set, a row made of this string is put below the header
    @param extra_gap   Extra spaces between columns
    @param hide_empty  Drop the columns in which every data cell is empty
    @returns           The table as one string, rows separated by newlines
    """
    def visible_len(text):
        # Terminal sequences and the alignment tab take up no space on screen
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep):
        # Columns beyond the end of "keep" are always kept
        return [cell for flag, cell in itertools.zip_longest(keep, row, fillvalue=True) if flag]

    # A column whose widest data cell has width 0 is falsy here and gets dropped
    keep = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep)
    data = [keep_columns(row, keep) for row in data]

    table = [header_row, *data]
    max_lens = column_widths(table)
    extra_gap += 1
    if delim:
        separator = [delim * (max_len + extra_gap) for max_len in max_lens]
        separator[-1] = separator[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table = [header_row, separator, *data]
    for row in table:
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - visible_len(text)
            if '\t' in text:
                # The padding goes in place of the tab, pushing the rest to the right
                row[pos] = text.replace('\t', ' ' * padding) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (padding + extra_gap)
    return '\n'.join(''.join(row).rstrip() for row in table)
3670
3671
3672 def _match_one(filter_part, dct, incomplete):
3673     # TODO: Generalize code with YoutubeDL._build_format_filter
3674     STRING_OPERATORS = {
3675         '*=': operator.contains,
3676         '^=': lambda attr, value: attr.startswith(value),
3677         '$=': lambda attr, value: attr.endswith(value),
3678         '~=': lambda attr, value: re.search(value, attr),
3679     }
3680     COMPARISON_OPERATORS = {
3681         **STRING_OPERATORS,
3682         '<=': operator.le,  # "<=" must be defined above "<"
3683         '<': operator.lt,
3684         '>=': operator.ge,
3685         '>': operator.gt,
3686         '=': operator.eq,
3687     }
3688
3689     if isinstance(incomplete, bool):
3690         is_incomplete = lambda _: incomplete
3691     else:
3692         is_incomplete = lambda k: k in incomplete
3693
3694     operator_rex = re.compile(r'''(?x)
3695         (?P<key>[a-z_]+)
3696         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3697         (?:
3698             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3699             (?P<strval>.+?)
3700         )
3701         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3702     m = operator_rex.fullmatch(filter_part.strip())
3703     if m:
3704         m = m.groupdict()
3705         unnegated_op = COMPARISON_OPERATORS[m['op']]
3706         if m['negation']:
3707             op = lambda attr, value: not unnegated_op(attr, value)
3708         else:
3709             op = unnegated_op
3710         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3711         if m['quote']:
3712             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3713         actual_value = dct.get(m['key'])
3714         numeric_comparison = None
3715         if isinstance(actual_value, (int, float)):
3716             # If the original field is a string and matching comparisonvalue is
3717             # a number we should respect the origin of the original field
3718             # and process comparison value as a string (see
3719             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3720             try:
3721                 numeric_comparison = int(comparison_value)
3722             except ValueError:
3723                 numeric_comparison = parse_filesize(comparison_value)
3724                 if numeric_comparison is None:
3725                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3726                 if numeric_comparison is None:
3727                     numeric_comparison = parse_duration(comparison_value)
3728         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3729             raise ValueError('Operator %s only supports string values!' % m['op'])
3730         if actual_value is None:
3731             return is_incomplete(m['key']) or m['none_inclusive']
3732         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3733
3734     UNARY_OPERATORS = {
3735         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3736         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3737     }
3738     operator_rex = re.compile(r'''(?x)
3739         (?P<op>%s)\s*(?P<key>[a-z_]+)
3740         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3741     m = operator_rex.fullmatch(filter_part.strip())
3742     if m:
3743         op = UNARY_OPERATORS[m.group('op')]
3744         actual_value = dct.get(m.group('key'))
3745         if is_incomplete(m.group('key')) and actual_value is None:
3746             return True
3747         return op(actual_value)
3748
3749     raise ValueError('Invalid filter part %r' % filter_part)
3750
3751
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param filter_str  Conditions separated by "&"; all of them must pass.
                       A literal "&" inside a condition is written as "\\&"
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # Split on every "&" that is not escaped with a backslash
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3762
3763
def match_filter_func(filters):
    """Build a match-filter callable from one or more filter strings.

    @param filters  A filter string or a collection of them. An entry passes
                    if any of the filters passes. The special filter "-"
                    makes the callable return NO_DEFAULT instead of None for
                    passing entries, unless the check is an incomplete one
    @returns        None if no filters are given. Otherwise a callable
                    (info_dict, incomplete=False) returning None/NO_DEFAULT
                    when the entry passes, or a message saying why it is skipped
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        passes = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if not passes:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3781
3782
class download_range_func:
    """Callable that yields the sections of a video to download.

    @param chapters  Regexes to match against chapter titles (may be None)
    @param ranges    (start, end) pairs (may be None)

    Calling an instance with (info_dict, ydl) yields one dict per matching
    chapter (the chapter plus its "index"), followed by one
    {"start_time", "end_time"} dict per range. If chapter regexes were
    given but none of them matched, a message is printed via ydl.to_screen.
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges
3803
3804
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds.

    Accepts a plain number with an optional "s" suffix, or a clock value
    H:MM:SS with an optional fraction introduced by "." or ":".
    Returns None for empty input or an unrecognised expression.
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # A ":" before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
3816
3817
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    timecode = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timecode
3820
3821
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc)."""
    timecode = timetuple_from_msec(seconds * 1000)
    # ASS has centisecond resolution; "%d" drops the fractional part
    centiseconds = timecode.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timecode[:-1], centiseconds)
3825
3826
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, keeping basic styling as
    <font>/<b>/<i>/<u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the data contains no paragraph elements
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Style id -> resolved properties (own properties on top of the inherited ones)
    styles = {}
    # Properties of the styles referenced by <body>/<div>; applied to every element
    default_style = {}

    class TTMLPElementParser:
        """Parser target that renders one paragraph as SRT text with inline tags"""

        def __init__(self):
            # All state is per instance. Were these lists class attributes,
            # every paragraph would share them and a style left over from
            # one paragraph would suppress the tags of the next one
            self._out = ''
            # One entry per open element: (tags to close, whether a style was pushed)
            self._unclosed_elements = []
            # Effective style of each open element that carries a style
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style_pushed = False
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Already in effect through an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                    style_pushed = True
                self._unclosed_elements.append((unclosed_elements, style_pushed))

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements, style_pushed = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                # Pop exactly when start() pushed, even if no tag was emitted
                # for the element; otherwise the stack gets out of step with
                # the element nesting
                if style_pushed:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the paragraph and feed it through the target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Map the legacy namespaces to the current ones so that one set of xpaths suffices
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style can be defined before its parent,
    # so the definitions are walked repeatedly until all are resolved
    while True:
        repeat = False
        resolved_before = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # A pass that resolved nothing new cannot be improved upon by another
        # one. This happens when a parent style is missing or has no
        # supported property; looping on would never terminate. Styles
        # depending on such a parent are left out
        if not repeat or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Without a usable "end", fall back to "begin" + "dur"
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3989
3990
def cli_option(params, command_option, param, separator=None):
    """Build the command line arguments for an option that takes a value.

    @param params          Mapping to read the value from
    @param command_option  The option name, e.g. "--proxy"
    @param param           Key of the value in params
    @param separator       If given, option and value are joined by it into a
                           single argument instead of being two arguments
    @returns               List of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3996
3997
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for an option that takes a boolean value.

    The value of params[param] must be True, False or None (or missing).
    True/False are rendered as true_value/false_value; None produces no
    arguments. See cli_option for the meaning of separator.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
4002
4003
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
4006
4007
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up extra command line arguments in a mapping of argument lists.

    @param argdict     Dict mapping lower-case keys to lists of arguments.
                       A plain list/tuple is returned as-is if use_compat is
                       set, and is otherwise treated like None
    @param keys        List/tuple of keys in order of priority. An entry may
                       itself be a group of keys, whose lists are concatenated
    @param default     Returned when argdict is None or no key matches
    @returns           The arguments of the first entry of keys that has any
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        found = []
        for key in variadic(key_list):
            args = argdict.get(key.lower())
            if args is not None:
                found.append(args)
        if found:
            combined = []
            for args in found:
                combined.extend(args)
            return combined
    return default
4026
4027
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the extra arguments for executable `exe` used on behalf of `main_key`.

    The lookup keys are "<root><suffix>" for each suffix in `keys`, where the
    root is "<main_key>+<exe>" (or just the name if both are equal). If the
    bare root is among them, the (main_key, exe) pair and "default" are
    tried as fallbacks; otherwise the compatibility handling of list-type
    argdict is turned off. The lookup itself is done by cli_configuration_args.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
4039
4040
class ISO639Utils:
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at.
        Returns None for an unknown code.
        """
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several two-letter codes map to the same three-letter code,
        the one listed first in the table wins. Returns None for an unknown code.
        """
        matches = (short for short, long in cls._lang_map.items() if long == code)
        return next(matches, None)
4244
4245
class ISO3166Utils:
    """Lookup of country names by two-letter (ISO 3166-1 alpha-2) country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes and
        for empty or missing (None) input.
        """
        if not code:
            # Treat "no code" like any other unknown code instead of failing on .upper()
            return None
        return cls._country_map.get(code.upper())
4507
4508
class GeoUtils:
    """Generation of random IPv4 addresses inside a country's address block"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address from a country's block or from a CIDR block

        @param code_or_block  either a two-letter country code (looked up
                              case-insensitively in the built-in table) or an
                              IPv4 block in CIDR notation ("a.b.c.d/len")
        @returns              the address as a dotted-quad string, or None if
                              the country code is not in the table
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        host_mask = 0xffffffff >> int(preflen)
        base = struct.unpack('!L', socket.inet_aton(addr))[0]
        # Clear the host bits so that a block written with a non-zero host part
        # (e.g. 10.0.0.255/24) still covers the whole network
        addr_min = base & ~host_mask
        addr_max = base | host_mask
        return socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max)))
4767
4768
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that lets each request choose its own proxy

    A request may carry a `Ytdl-request-proxy` header naming the proxy to use
    for that request; the header is removed once it has been read. SOCKS
    proxies are not applied here but announced to later handlers through a
    `Ytdl-socks-proxy` header.
    """

    def __init__(self, proxies=None):
        # Set default handlers so that http/https requests reach proxy_open()
        # even when no proxy is configured for the scheme
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)

            setattr(self, f'{scheme}_open', default_open)
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            # The per-request header takes precedence over the configured proxy
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers take care of wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return super().proxy_open(req, proxy, type)
4792
4793
4794 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4795 # released into Public Domain
4796 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4797
def long_to_bytes(n, blocksize=0):
    """Convert an integer to a big-endian byte string without leading zeros

    Zero and negative values are encoded as a single NUL byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    number = int(n)
    if number > 0:
        encoded = number.to_bytes((number.bit_length() + 7) // 8, 'big')
    else:
        encoded = b'\000'

    if blocksize > 0:
        overhang = len(encoded) % blocksize
        if overhang:
            encoded = b'\000' * (blocksize - overhang) + encoded
    return encoded
4826
4827
def bytes_to_long(s):
    """Interpret a byte string as a big-endian unsigned integer

    This is (essentially) the inverse of long_to_bytes(). An empty byte
    string yields 0.
    """
    overhang = len(s) % 4
    if overhang:
        # Left-pad to a whole number of 32-bit words
        s = b'\000' * (4 - overhang) + s

    value = 0
    for (word,) in struct.iter_unpack('>I', s):
        value = (value << 32) + word
    return value
4843
4844
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object (read as a little-endian integer)
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4860
4861
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves less than 11 bytes for the padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding bytes must be non-zero: the first zero byte after the
    # [0, 2] header marks where the data starts
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *padding, 0, *data]
4875
4876
4877 def _base_n_table(n, table):
4878     if not table and not n:
4879         raise ValueError('Either table or n must be specified')
4880     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4881
4882     if n and n != len(table):
4883         raise ValueError(f'base {n} exceeds table length {len(table)}')
4884     return table
4885
4886
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num    non-negative integer to convert
    @param n      base; when omitted, the length of table is used
    @param table  digit characters, lowest value first
    @raises ValueError  if num is negative, or if num is non-zero and the
                        base is smaller than 2
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Floor division never reaches zero for negative numbers or for a base
    # below 2, so reject those instead of looping forever
    if num < 0:
        raise ValueError('Cannot encode a negative number')
    if base < 2:
        raise ValueError(f'Cannot encode a number in base {base}')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4898
4899
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    Raises KeyError if the string contains a character that is not in the
    digit table.
    """
    digit_values = {}
    for position, symbol in enumerate(_base_n_table(n, table)):
        digit_values[symbol] = position

    radix = len(digit_values)
    total = 0
    for symbol in string:
        total = total * radix + digit_values[symbol]
    return total
4907
4908
def decode_base(value, digits):
    """Deprecated: emit a deprecation warning, then decode via decode_base_n"""
    message = (
        f'{__name__}.decode_base is deprecated and may be removed '
        f'in a future version. Use {__name__}.decode_base_n instead')
    deprecation_warning(message)
    return decode_base_n(value, table=digits)
4913
4914
def decode_packed_codes(code):
    """Expand packed code by substituting its symbol table back in

    `code` is searched with PACKED_CODES_RE, which must capture the
    obfuscated payload, the base, the symbol count and the '|'-separated
    symbols, in that order.
    NOTE(review): presumably this targets JavaScript "p,a,c,k,e,d" packer
    output - confirm against PACKED_CODES_RE.

    Each word in the payload is the base-n encoding of an index into the
    symbol list; it is replaced by that symbol. Words whose symbol is empty,
    or that have no entry at all, are left unchanged.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in range(count):
        word = encode_base_n(index, base)
        # An empty symbol means that the word stands for itself
        symbol_table[word] = symbols[index] or word

    # Words without a table entry were not packed; keep them instead of
    # failing with KeyError
    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table.get(m.group(0), m.group(0)),
        obfuscated_code)
4931
4932
def caesar(s, alphabet, shift):
    """Shift every character of `s` by `shift` positions within `alphabet`

    Characters that are not part of `alphabet` are kept unchanged; positions
    wrap around at the ends of the alphabet.
    """
    if shift == 0:
        return s

    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4940
4941
def rot47(s):
    """Apply a caesar shift of 47 over the 94 printable ASCII characters '!' to '~'"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4944
4945
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=VALUE attribute list into a dict

    Values may be bare or enclosed in double quotes; the quotes are removed.
    When a key occurs more than once, the last value wins.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    attributes = {}
    for match in re.finditer(attribute_re, attrib):
        name, raw_value = match.group('key', 'val')
        attributes[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return attributes
4953
4954
def urshift(val, n):
    """Right-shift `val` by `n` bits, treating a negative `val` as unsigned 32-bit

    Negative values are offset by 2**32 before shifting.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
4957
4958
4959 # Based on png2str() written by @gdkchan and improved by @yokrysty
4960 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into rows of unfiltered sample values

    Returns a (width, height, pixels) tuple, where pixels holds one list per
    scanline with width * 3 integer samples each.

    NOTE(review): the stride is hard-coded to 3 bytes per pixel, so this
    presumably only handles non-interlaced 8-bit RGB images - confirm.

    Raises OSError if the PNG signature or the leading IHDR chunk is missing,
    or if the file contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]
    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    struct_formats = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return struct.unpack(struct_formats[len(raw)], raw)[0]

    # Every chunk is laid out as: length (4 bytes), type (4 bytes), data, CRC (4 bytes)
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunks.append({
            'type': remaining[4:8],
            'length': size,
            'data': remaining[8:8 + size],
        })
        remaining = remaining[size + 12:]  # the CRC is skipped, not verified

    header_fields = chunks[0]['data']
    width = read_uint(header_fields[:4])
    height = read_uint(header_fields[4:8])

    idat = b''.join(chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT')
    if not idat:
        raise OSError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))
    stride = width * 3
    pixels = []

    for y in range(height):
        # Each scanline is prefixed with one byte naming its filter type
        row_start = y * (stride + 1)
        filter_type = scanlines[row_start]
        row_above = pixels[y - 1] if y else None
        row = []
        pixels.append(row)

        for x in range(stride):
            color = scanlines[row_start + 1 + x]
            # Neighbours outside of the image count as zero
            left = row[x - 3] if x > 2 else 0
            up = row_above[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color += left
            elif filter_type == 2:  # Up
                color += up
            elif filter_type == 3:  # Average
                color += (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = row_above[x - 3] if x > 2 and y > 0 else 0
                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    color += left
                elif dist_up <= dist_upper_left:
                    color += up
                else:
                    color += upper_left

            row.append(color & 0xff)

    return width, height, pixels
5064
5065
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`

    `value` is a bytes object: it is written in binary mode on Windows and
    decoded to text before being handed to an external tool.

    The first available method is used:
      1. Windows: an NTFS Alternate Data Stream named `key`
      2. the python "pyxattr" (>= 0.5.0) or "xattr" module
      3. the "setfattr" or "xattr" executable

    Raises XAttrMetadataError if writing the attribute fails and
    XAttrUnavailableError if no method for writing xattrs is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file name from the stream name
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    # setfattr is preferred; xattr is only probed when setfattr is unavailable
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command line argument, so it has to be text
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        # A non-zero exit status is reported together with the tool's stderr output
        raise XAttrMetadataError(returncode, stderr)
5115
5116
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive)

    Returns a dict that maps the three given field names to the year, month
    and day of that date as decimal strings.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    day_offset = random.randint(0, latest.toordinal() - earliest.toordinal())
    birthday = datetime.date.fromordinal(earliest.toordinal() + day_offset)

    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return {name: str(part) for name, part in zip(field_names, date_parts)}
5127
5128
# Templates for internet shortcut files, which are plain text files.
# The placeholders are %-style mapping keys: every template takes "url",
# and the desktop-entry template additionally takes "filename".

# INI-style "[InternetShortcut]" file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property-list (plist) XML file
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# "[Desktop Entry]" file of type "Link"
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5160
5161
def iri_to_uri(iri):
    """
    Convert an IRI (which may contain Unicode characters) into an ASCII-only URI.

    Characters that are already percent-escaped are left untouched (`%3C` is not
    turned into `%253C`); everything else outside the per-component "safe" sets
    is percent-escaped using its UTF-8 encoding. The hostname is converted with
    the 'idna' codec (Punycode), and an explicit port 80 is dropped.

    Raises ValueError for netlocs containing '[' (IPv6 literals are not supported).
    """
    # The "safe" sets list the characters that must NOT be percent-encoded.
    # Letters, digits and '_.-' are never encoded either.
    # Source for the sets: https://url.spec.whatwg.org/#percent-encoded-bytes.
    userinfo_safe = r"!$%&'()*+,~"
    path_safe = r"!$%&'()*+,/:;=@|~"
    # Unsure about this one, since params are a legacy way of handling parameters.
    params_safe = r"!$%&'()*+,/:;=@|~"
    # Not totally sure about this one, since the source does not explicitly mention the query component.
    query_safe = r"!$%&'()*+,/:;=?@{|}~"
    fragment_safe = r"!#$%&'()*+,/:;=?@{|}~"

    parsed = urllib.parse.urlparse(iri)

    # NB: Parsing a netloc that has only one bracket raises a ValueError by itself
    if '[' in parsed.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    netloc_pieces = []
    if parsed.username:
        netloc_pieces.append(urllib.parse.quote(parsed.username, safe=userinfo_safe))
        if parsed.password is not None:
            netloc_pieces.append(':' + urllib.parse.quote(parsed.password, safe=userinfo_safe))
        netloc_pieces.append('@')

    # The 'idna' encoding produces ASCII text (Punycode for Unicode hostnames)
    netloc_pieces.append(parsed.hostname.encode('idna').decode())
    port = parsed.port
    if port is not None and port != 80:
        netloc_pieces.append(':' + str(port))

    components = (
        parsed.scheme,
        ''.join(netloc_pieces),
        urllib.parse.quote_plus(parsed.path, safe=path_safe),
        urllib.parse.quote_plus(parsed.params, safe=params_safe),
        urllib.parse.quote_plus(parsed.query, safe=query_safe),
        urllib.parse.quote_plus(parsed.fragment, safe=fragment_safe),
    )
    return urllib.parse.urlunparse(components)
5204
5205
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to the MAX_PATH limit on Windows

    On win32/cygwin the absolute path is prefixed with "\\\\?\\"; on every other
    platform the path is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
5212
5213
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and render it with a %-style `template`

    @param field     Path passed to traverse_obj (None uses `obj` itself)
    @param template  %-format string applied to `func(value)`
    @param ignore    Value(s) for which `default` is returned instead. When not
                     given, every falsy value except 0 is ignored
    @param default   What to return for ignored values
    @param func      Transformation applied to the value before formatting
    """
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val and val != 0
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
5219
5220
def clean_podcast_url(url):
    """Remove known podcast tracking/analytics redirect prefixes from `url`

    Every occurrence of a recognised prefix (including its trailing slash) is
    removed; URLs without such a prefix are returned unchanged.
    """
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
5236
5237
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random (version 4) UUID string using the `random` module

    The version nibble is fixed to "4" and the variant nibble (the "y" in the
    template) is restricted to 8, 9, a or b, as required for RFC 4122 UUIDs.
    All other hex digits are chosen uniformly at random.
    """
    def _random_digit(mobj):
        if mobj.group() == 'y':
            # Variant bits must be "10xx", i.e. one of 8, 9, a, b
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5243
5244
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` if it does not exist yet

    @param path       File path whose directory component should exist
    @param to_screen  Optional callable used to report a failure message
    @returns          True on success (or if nothing had to be created),
                      False if the directory could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok guards against the directory being created concurrently
            # between the check above and this call
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually supplied
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5255
5256
def get_executable_path():
    """Return the absolute directory that contains the running executable

    The executable location is the second item returned by
    `update._get_variant_and_executable_path()`.
    """
    # Imported here rather than at module level
    from .update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
5261
5262
def load_plugins(name, suffix, namespace):
    """Load plugin classes from "ytdlp_plugins/<name>/__init__.py" next to the executable

    @param name       Name of the plugin package (also used as the module name)
    @param suffix     Only attributes whose name ends with this are loaded
    @param namespace  Dict that receives the loaded attributes. Names already
                      present in it are skipped
    @returns          Dict of the newly loaded {name: attribute} pairs
                      (empty if a FileNotFoundError is raised while loading)
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        init_file = os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')
        spec = importlib.util.spec_from_file_location(name, init_file)
        module = importlib.util.module_from_spec(spec)
        # Register the module before executing it
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)
        for attr in dir(module):
            if attr in namespace or not attr.endswith(suffix):
                continue
            classes[attr] = namespace[attr] = getattr(module, attr)
    return classes
5279
5280
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - None:     Do nothing
                              - string:   A dictionary key / regex group
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive

    The following are only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API

    @param path_list        In addition to the above,
                              - dict:     Given {k:v, ...}; return {k: traverse_obj(obj, v), ...}
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string

    @returns                The value found by the first path that yields a result which
                            is not None (after applying expected_type), else `default`.
                            Paths that branch (Ellipsis, tuple/list or function keys)
                            produce a flattened list of the non-None results
    '''  # TODO: Write tests
    if not casesense:
        # Lowercase all string keys of every path; non-string keys are kept as-is
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` belongs to the enclosing function and records the deepest level
        # of branching seen for the current path; it is used below to flatten the result
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # A None key or a None object ends the traversal with the current object
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse every alternative, then branch over the collected results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...

            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                # The rest of the path is applied to each branch separately
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif isinstance(key, dict):
                # Build a dict of sub-results; filter_dict post-processes it
                obj = filter_dict({k: _traverse_obj(obj, v, _current_depth) for k, v in key.items()})
            elif callable(key):
                # Normalise the object into (key, value) pairs for the predicate
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dictionary lookup; falls back to a case-insensitive scan
                # when casesense is off and the exact key is missing
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User-supplied keys are strings: convert to an int index or a slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ":" behaves like Ellipsis
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # expected_type may be a type (checked with isinstance) or any callable that
    # maps a value to the accepted result (or None to reject it)
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching produced nested lists: flatten all but the outermost level
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5384
5385
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj

    Emits a deprecation warning, then traverses `dictn` along `keys` with
    `is_user_input` and `traverse_string` enabled.
    """
    message = (
        f'"{__name__}.traverse_dict" is deprecated and may be removed '
        f'in a future version. Use "{__name__}.traverse_obj" instead')
    deprecation_warning(message)
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5390
5391
def get_first(obj, keys, **kwargs):
    """Traverse `keys` in every value of `obj` and return only the first result

    Remaining keyword arguments are forwarded to traverse_obj.
    """
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, **kwargs, get_all=False)
5394
5395
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is an iterable, else wrap it in a 1-tuple

    @param allowed_types  Iterable types that should nevertheless be treated as
                          a single value (and therefore wrapped)
    """
    if not isinstance(x, collections.abc.Iterable):
        return (x,)
    if isinstance(x, allowed_types):
        return (x,)
    return x
5398
5399
def time_seconds(**kwargs):
    """Return the current time as a POSIX timestamp (float seconds)

    @param kwargs  Passed to datetime.timedelta to build the UTC offset of the
                   timezone in which "now" is taken. The offset must be valid
                   for datetime.timezone, otherwise ValueError is raised.
                   NOTE(review): the timestamp of an aware datetime does not
                   depend on its timezone, so the offset does not appear to
                   change the returned value - confirm against callers
    """
    utc_offset = datetime.timedelta(**kwargs)
    tz = datetime.timezone(utc_offset)
    now = datetime.datetime.now(tz)
    return now.timestamp()
5403
5404
5405 # create a JSON Web Signature (jws) with HS256 algorithm
5406 # the resulting format is in JWS Compact Serialization
5407 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5408 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build an HS256-signed token from `payload_data`

    @param payload_data  JSON-serializable payload
    @param key           Signing key (str)
    @param headers       Extra header fields, merged over {"alg": "HS256", "typ": "JWT"}
    @returns             bytes of the form b"<header>.<payload>.<signature>", where
                         each part is encoded with standard (padded) base64
    """
    def _b64_json(data):
        return base64.b64encode(json.dumps(data).encode())

    header_data = dict(alg='HS256', typ='JWT')
    if headers:
        header_data.update(headers)

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
5422
5423
5424 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a compact-serialized JWT WITHOUT verifying its signature

    @param jwt  Token string of the form "<header>.<payload>.<signature>"
    @returns    The JSON-decoded payload
    @raises     ValueError if the token does not consist of exactly three
                dot-separated segments
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # JWS compact serialization strips the base64 padding, but the decoder
    # needs the input length to be a multiple of 4
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5429
5430
# None when not running on Windows; on Windows: False until windows_enable_vt_mode() succeeds
WINDOWS_VT_MODE = None if compat_os_name != 'nt' else False


@functools.cache
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences may be written to `stream`

    On Windows this requires WINDOWS_VT_MODE to be enabled; elsewhere the TERM
    environment variable must be set. In both cases the stream must also be a
    TTY. Any error while querying the stream counts as "not supported".
    The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5445
5446
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Try to turn on terminal-sequence support on Windows

    Does nothing on Windows versions older than 10.0.10586 or when spawning the
    (empty) shell command fails. On success, WINDOWS_VT_MODE is set and the
    cache of supports_terminal_sequences is cleared so that it gets re-evaluated.
    """
    global WINDOWS_VT_MODE
    if get_windows_version() < (10, 0, 10586):
        return
    try:
        Popen.run('', shell=True)
    except Exception:
        return
    else:
        WINDOWS_VT_MODE = True
        supports_terminal_sequences.cache_clear()
5458
5459
# Matches "ESC [ ... m" sequences
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with all "ESC[...m" terminal sequences removed"""
    return re.sub(_terminal_sequences_re, '', string)
5465
5466
def number_of_digits(number):
    """Return the length of `number` rendered with "%d" (a minus sign is counted too)"""
    rendered = '%d' % number
    return len(rendered)
5469
5470
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy `values` with `delim`

    @param from_dict  If given, each value is treated as a path and is first
                      looked up in this object using traverse_obj
    """
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
5475
5476
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Rescale thumbnails to the dimensions of the largest format

    The largest format is the one with the greatest (width, height) pair. For each thumbnail:
    * The part of the URL matched by `url_width_re` is replaced with that width
    * "width" and "height" are set to the format's dimensions, with any other
      thumbnail fields merged in via merge_dicts

    If no format has a width, `thumbnails` is returned as-is.
    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    largest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        rescaled_url = {'url': re.sub(url_width_re, str(max_width), thumbnail['url'])}
        scaled.append(merge_dicts(rescaled_url, dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
5497
5498
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns  (start, end, total length); items that are not present are None.
              (None, None, None) for empty or unrecognised values
    """
    if not range:
        return None, None, None
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return None, None, None
    start, end, total = mobj.group(1), mobj.group(2), mobj.group(3)
    return int(start), int_or_none(end), int_or_none(total)
5507
5508
def read_stdin(what):
    """Announce that `what` is being read from STDIN and return the sys.stdin stream

    The message names the key combination that sends EOF:
    Ctrl+Z when compat_os_name is "nt", Ctrl+D otherwise.
    """
    eof_key = 'Ctrl+D' if compat_os_name != 'nt' else 'Ctrl+Z'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
5513
5514
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  The first bytes of the file
    @returns (encoding, bytes to skip); encoding is None if it could not be detected
    """

    # BOM marks are given priority over declarations
    bom_match = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if not declaration:
        return None, 0
    return declaration.group(1).decode(), 0
5531
5532
class Config:
    """A set of command-line arguments together with the configs it references

    Each instance holds its own arguments (`own_args`) and a list of nested
    Config objects (`configs`), one per location named in the parsed
    `config_locations` option. `_loaded_paths` is shared with the nested configs
    so that no location is loaded twice.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser = parser
        self.label = label
        self._loaded_paths = set()
        self.configs = []

    def init(self, args=None, filename=None):
        """Set the arguments/filename of this config and load everything it references

        May only be called once per instance. Returns the result of load_configs().
        """
        assert not self.__initialized
        self.own_args = args
        self.filename = filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and load all config locations given in them

        @returns  False if this config's file was already loaded, True otherwise
        """
        base_dir = ''
        if self.filename:
            real_path = os.path.realpath(self.filename)
            base_dir = os.path.dirname(real_path)
            if real_path in self._loaded_paths:
                return False
            self._loaded_paths.add(real_path)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # "-" means STDIN, which is read at most once
                if location not in self._loaded_paths:
                    self._loaded_paths.add(location)
                    stdin_args = shlex.split(read_stdin('options'), comments=True)
                    self.append_config(stdin_args, label='stdin')
                continue
            # Locations are resolved relative to the directory of this config's file
            config_path = os.path.join(base_dir, expand_path(location))
            if os.path.isdir(config_path):
                config_path = os.path.join(config_path, 'yt-dlp.conf')
            if not os.path.exists(config_path):
                self.parser.error(f'config location {config_path} does not exist')
            self.append_config(self.read_file(config_path), config_path)
        return True

    def __str__(self):
        title = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        if self.own_args is not None:
            own_line = f'{title[0].upper()}{title[1:]}: {self.hide_login_info(self.own_args)}'
        else:
            own_line = None
        # Nested configs are shown below, each of their lines prefixed with "| "
        nested_lines = [f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs]
        return join_nonempty(own_line, *nested_lines, delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments

        @returns  The arguments, or `default` if the file cannot be opened
        @raises   ValueError if the contents cannot be decoded or parsed
        """
        try:
            handle = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            encoding, skip = determine_file_encoding(handle.read(512))
            handle.seek(skip, io.SEEK_SET)
        except OSError:
            encoding = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            text = handle.read().decode(encoding or preferredencoding())
            return shlex.split(text, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            handle.close()

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options replaced by "PRIVATE"

        Both the "--option=value" and the "--option value" forms are handled.
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(option):
            found = eqre.match(option)
            return found.group('key') + '=PRIVATE' if found else option

        scrubbed = [_scrub_eq(option) for option in opts]
        # A private option given as a separate argument hides the argument after it
        for idx in range(len(scrubbed) - 1):
            if scrubbed[idx] in PRIVATE_OPTS:
                scrubbed[idx + 1] = 'PRIVATE'
        return scrubbed

    def append_config(self, *args, label=None):
        """Create a nested config from `args` (see init) and keep it if it got loaded"""
        nested = type(self)(self.parser, label)
        nested._loaded_paths = self._loaded_paths
        if nested.init(*args):
            self.configs.append(nested)

    @property
    def all_args(self):
        """Arguments of all nested configs (last added first), followed by the own parsed arguments"""
        for nested in reversed(self.configs):
            yield from nested.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5640
5641
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes

    Every instance owns a private event loop on which the coroutines of the
    `websockets` connection are run to completion. It can be used as a context
    manager; `__exit__` is additionally registered with atexit.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL to connect to
        @param headers  Extra headers passed to websockets.connect
        @param connect  Whether to open the connection immediately
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        # Open the connection only once
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Send a message; the arguments are passed on to the connection's send()"""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Receive a message; the arguments are passed on to the connection's recv()"""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Leftover tasks can only be cancelled while the loop is still able
            # to run, so the loop must be closed last (same order as asyncio.run)
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result

        Raises ValueError if `main` is not a coroutine.
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all unfinished tasks of `loop` and wait until they are done

        `loop` must not be closed yet. Exceptions raised by the tasks (other
        than the cancellation itself) are reported to the loop's exception handler.
        """
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # NB: gather() must not be given "loop" (removed in python 3.10);
        # the loop is picked up from the tasks themselves
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5711
5712
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    Header names are normalised with str.title() in the returned dict.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5716
5717
def cached_method(f):
    """Cache a method

    Results are stored on the instance itself (in `__cached_method__cache`),
    per method name, keyed by the fully bound arguments with defaults applied.
    All arguments must therefore be hashable.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalise the call so that f(1), f(a=1) and explicit defaults share a key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())

        try:
            instance_cache = self.__cached_method__cache
        except AttributeError:
            instance_cache = self.__cached_method__cache = {}

        method_cache = instance_cache.setdefault(f.__name__, {})
        if cache_key in method_cache:
            return method_cache[cache_key]
        result = method_cache[cache_key] = f(self, *args, **kwargs)
        return result
    return wrapper
5735
5736
class classproperty:
    """property access for class methods

    The decorated function receives the class and is evaluated on every
    attribute access, whether made through the class or through an instance.
    """

    def __init__(self, func):
        # Copy the function's metadata (name, docstring, ...) onto the descriptor
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, owner):
        return self.func(owner)
5746
5747
class Namespace(types.SimpleNamespace):
    """Immutable namespace

    Iterating over the namespace yields its values;
    `items_` gives the (name, value) pairs.
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
5757
5758
# File extensions grouped by kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# "video" and "audio" are extended so that they also contain the "common_*" extensions
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5773
5774
class RetryManager:
    """Iterator that drives a retry loop

    Each iteration is one attempt. The loop body reports a failure by assigning
    to `error`; an attempt that leaves `error` unset ends the loop. After a
    failed attempt the error callback is invoked with (error, attempt, retries)
    plus the keyword arguments given to the constructor.

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks an attempt that finished without setting an error
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e           The error of the failed attempt
        @param count       Number of the attempt that failed
        @param retries     Maximum number of retries
        @param sleep_func  Delay before the next attempt: a number, or a
                           callable taking the keyword argument `n`
        @param info        Callable used to report the delay
        @param warn        Callable used to report that a retry will happen
        @param error       Callable used to report that no retries are left.
                           If not given, `e` is raised instead
        @param suffix      Optional text appended after "Retrying"
        """
        if count > retries:
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5829
5830
def make_archive_id(ie, video_id):
    """Build the archive id "<lowercased extractor key> <video id>"

    @param ie  Either the extractor key as a string, or an object providing ie_key()
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5834
5835
def truncate_string(s, left, right=0):
    """Shorten `s` to `left + right` characters by replacing its middle with "..."

    @param left   Number of characters for the start of the result, including
                  the 3 characters of the ellipsis (must be greater than 3)
    @param right  Number of characters to keep from the end of `s`
    @returns      `s` itself if it is None or short enough, else the truncated string
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    # NB: s[-0:] is the whole string, so right == 0 needs special handling
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
5841
5842
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve a list of options into an ordered set of selected items

    @param options     Item names or alias names. A leading "-" removes the
                       item(s) from the selection instead of adding them
    @param alias_dict  Maps alias names to lists of options. Must contain
                       "all", which lists every valid item
    @param use_regex   Treat non-alias options as case-insensitive regexes that
                       must fully match an item of "all"
    @param start       Initial selection
    @raises ValueError If an option is neither an alias nor (without use_regex) a valid item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selected = list(start or [])
    for option in options:
        negate = option.startswith('-')
        name = option[1:] if negate else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if negate:
                # Removing an alias inverts the sign of each of its members
                expansion = [
                    item[1:] if item.startswith('-') else f'-{item}' for item in expansion]
            # NB: Do not allow regex in aliases for performance
            selected = orderedSet_from_options(expansion, alias_dict, start=selected)
            continue

        if use_regex:
            matches = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matches = [name]
        else:
            raise ValueError(name)

        if not negate:
            selected.extend(matches)
            continue
        for item in matches:
            while item in selected:
                selected.remove(item)

    return orderedSet(selected)
5871
5872
# Deprecated
# Flags telling whether the `certifi`/`websockets` objects imported from
# `.dependencies` are truthy
has_certifi = bool(certifi)
has_websockets = bool(websockets)