yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import importlib.util
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import mimetypes
  24 import operator
  25 import os
  26 import platform
  27 import random
  28 import re
  29 import shlex
  30 import socket
  31 import ssl
  32 import subprocess
  33 import sys
  34 import tempfile
  35 import time
  36 import traceback
  37 import urllib.parse
  38 import xml.etree.ElementTree
  39 import zlib
  40
  41 from .compat import asyncio, functools  # isort: split
  42 from .compat import (
  43     compat_chr,
  44     compat_cookiejar,
  45     compat_etree_fromstring,
  46     compat_expanduser,
  47     compat_html_entities,
  48     compat_html_entities_html5,
  49     compat_HTMLParseError,
  50     compat_HTMLParser,
  51     compat_http_client,
  52     compat_HTTPError,
  53     compat_os_name,
  54     compat_parse_qs,
  55     compat_shlex_quote,
  56     compat_str,
  57     compat_struct_pack,
  58     compat_struct_unpack,
  59     compat_urllib_error,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_parse_urlencode,
  62     compat_urllib_parse_urlparse,
  63     compat_urllib_request,
  64     compat_urlparse,
  65 )
  66 from .dependencies import brotli, certifi, websockets
  67 from .socks import ProxyType, sockssocket
  68
  69
  70 def register_socks_protocols():
  71     # "Register" SOCKS protocols
  72     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  73     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  74     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  75         if scheme not in compat_urlparse.uses_netloc:
  76             compat_urlparse.uses_netloc.append(scheme)
  77
  78
  79 # This is not clearly defined otherwise
  80 compiled_regex_type = type(re.compile(''))
  81
  82
  83 def random_user_agent():
  84     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  85     _CHROME_VERSIONS = (
  86         '90.0.4430.212',
  87         '90.0.4430.24',
  88         '90.0.4430.70',
  89         '90.0.4430.72',
  90         '90.0.4430.85',
  91         '90.0.4430.93',
  92         '91.0.4472.101',
  93         '91.0.4472.106',
  94         '91.0.4472.114',
  95         '91.0.4472.124',
  96         '91.0.4472.164',
  97         '91.0.4472.19',
  98         '91.0.4472.77',
  99         '92.0.4515.107',
 100         '92.0.4515.115',
 101         '92.0.4515.131',
 102         '92.0.4515.159',
 103         '92.0.4515.43',
 104         '93.0.4556.0',
 105         '93.0.4577.15',
 106         '93.0.4577.63',
 107         '93.0.4577.82',
 108         '94.0.4606.41',
 109         '94.0.4606.54',
 110         '94.0.4606.61',
 111         '94.0.4606.71',
 112         '94.0.4606.81',
 113         '94.0.4606.85',
 114         '95.0.4638.17',
 115         '95.0.4638.50',
 116         '95.0.4638.54',
 117         '95.0.4638.69',
 118         '95.0.4638.74',
 119         '96.0.4664.18',
 120         '96.0.4664.45',
 121         '96.0.4664.55',
 122         '96.0.4664.93',
 123         '97.0.4692.20',
 124     )
 125     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 126
 127
 128 SUPPORTED_ENCODINGS = [
 129     'gzip', 'deflate'
 130 ]
 131 if brotli:
 132     SUPPORTED_ENCODINGS.append('br')
 133
 134 std_headers = {
 135     'User-Agent': random_user_agent(),
 136     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 137     'Accept-Language': 'en-us,en;q=0.5',
 138     'Sec-Fetch-Mode': 'navigate',
 139 }
 140
 141
 142 USER_AGENTS = {
 143     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 144 }
 145
 146
 147 NO_DEFAULT = object()
 148
 149 ENGLISH_MONTH_NAMES = [
 150     'January', 'February', 'March', 'April', 'May', 'June',
 151     'July', 'August', 'September', 'October', 'November', 'December']
 152
 153 MONTH_NAMES = {
 154     'en': ENGLISH_MONTH_NAMES,
 155     'fr': [
 156         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 157         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 158 }
 159
 160 KNOWN_EXTENSIONS = (
 161     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 162     'flv', 'f4v', 'f4a', 'f4b',
 163     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 164     'mkv', 'mka', 'mk3d',
 165     'avi', 'divx',
 166     'mov',
 167     'asf', 'wmv', 'wma',
 168     '3gp', '3g2',
 169     'mp3',
 170     'flac',
 171     'ape',
 172     'wav',
 173     'f4f', 'f4m', 'm3u8', 'smil')
 174
 175 # needed for sanitizing filenames in restricted mode
 176 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 177                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 178                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 179
 180 DATE_FORMATS = (
 181     '%d %B %Y',
 182     '%d %b %Y',
 183     '%B %d %Y',
 184     '%B %dst %Y',
 185     '%B %dnd %Y',
 186     '%B %drd %Y',
 187     '%B %dth %Y',
 188     '%b %d %Y',
 189     '%b %dst %Y',
 190     '%b %dnd %Y',
 191     '%b %drd %Y',
 192     '%b %dth %Y',
 193     '%b %dst %Y %I:%M',
 194     '%b %dnd %Y %I:%M',
 195     '%b %drd %Y %I:%M',
 196     '%b %dth %Y %I:%M',
 197     '%Y %m %d',
 198     '%Y-%m-%d',
 199     '%Y.%m.%d.',
 200     '%Y/%m/%d',
 201     '%Y/%m/%d %H:%M',
 202     '%Y/%m/%d %H:%M:%S',
 203     '%Y%m%d%H%M',
 204     '%Y%m%d%H%M%S',
 205     '%Y%m%d',
 206     '%Y-%m-%d %H:%M',
 207     '%Y-%m-%d %H:%M:%S',
 208     '%Y-%m-%d %H:%M:%S.%f',
 209     '%Y-%m-%d %H:%M:%S:%f',
 210     '%d.%m.%Y %H:%M',
 211     '%d.%m.%Y %H.%M',
 212     '%Y-%m-%dT%H:%M:%SZ',
 213     '%Y-%m-%dT%H:%M:%S.%fZ',
 214     '%Y-%m-%dT%H:%M:%S.%f0Z',
 215     '%Y-%m-%dT%H:%M:%S',
 216     '%Y-%m-%dT%H:%M:%S.%f',
 217     '%Y-%m-%dT%H:%M',
 218     '%b %d %Y at %H:%M',
 219     '%b %d %Y at %H:%M:%S',
 220     '%B %d %Y at %H:%M',
 221     '%B %d %Y at %H:%M:%S',
 222     '%H:%M %d-%b-%Y',
 223 )
 224
 225 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 226 DATE_FORMATS_DAY_FIRST.extend([
 227     '%d-%m-%Y',
 228     '%d.%m.%Y',
 229     '%d.%m.%y',
 230     '%d/%m/%Y',
 231     '%d/%m/%y',
 232     '%d/%m/%Y %H:%M:%S',
 233 ])
 234
 235 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 236 DATE_FORMATS_MONTH_FIRST.extend([
 237     '%m-%d-%Y',
 238     '%m.%d.%Y',
 239     '%m/%d/%Y',
 240     '%m/%d/%y',
 241     '%m/%d/%Y %H:%M:%S',
 242 ])
 243
 244 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 245 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 246
 247 NUMBER_RE = r'\d+(?:\.\d+)?'
 248
 249
 250 @functools.cache
 251 def preferredencoding():
 252     """Get preferred encoding.
 253
 254     Returns the best encoding scheme for the system, based on
 255     locale.getpreferredencoding() and some further tweaks.
 256     """
 257     try:
 258         pref = locale.getpreferredencoding()
 259         'TEST'.encode(pref)
 260     except Exception:
 261         pref = 'UTF-8'
 262
 263     return pref
 264
 265
 266 def write_json_file(obj, fn):
 267     """ Encode obj as JSON and write it to fn, atomically if possible """
 268
 269     tf = tempfile.NamedTemporaryFile(
 270         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 271         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 272
 273     try:
 274         with tf:
 275             json.dump(obj, tf, ensure_ascii=False)
 276         if sys.platform == 'win32':
 277             # Need to remove existing file on Windows, else os.rename raises
 278             # WindowsError or FileExistsError.
 279             with contextlib.suppress(OSError):
 280                 os.unlink(fn)
 281         with contextlib.suppress(OSError):
 282             mask = os.umask(0)
 283             os.umask(mask)
 284             os.chmod(tf.name, 0o666 & ~mask)
 285         os.rename(tf.name, fn)
 286     except Exception:
 287         with contextlib.suppress(OSError):
 288             os.remove(tf.name)
 289         raise
 290
 291
 292 def find_xpath_attr(node, xpath, key, val=None):
 293     """ Find the xpath xpath[@key=val] """
 294     assert re.match(r'^[a-zA-Z_-]+$', key)
 295     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 296     return node.find(expr)
 297
 298 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 299 # the namespace parameter
 300
 301
 302 def xpath_with_ns(path, ns_map):
 303     components = [c.split(':') for c in path.split('/')]
 304     replaced = []
 305     for c in components:
 306         if len(c) == 1:
 307             replaced.append(c[0])
 308         else:
 309             ns, tag = c
 310             replaced.append('{%s}%s' % (ns_map[ns], tag))
 311     return '/'.join(replaced)
 312
 313
 314 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 315     def _find_xpath(xpath):
 316         return node.find(xpath)
 317
 318     if isinstance(xpath, (str, compat_str)):
 319         n = _find_xpath(xpath)
 320     else:
 321         for xp in xpath:
 322             n = _find_xpath(xp)
 323             if n is not None:
 324                 break
 325
 326     if n is None:
 327         if default is not NO_DEFAULT:
 328             return default
 329         elif fatal:
 330             name = xpath if name is None else name
 331             raise ExtractorError('Could not find XML element %s' % name)
 332         else:
 333             return None
 334     return n
 335
 336
 337 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 338     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 339     if n is None or n == default:
 340         return n
 341     if n.text is None:
 342         if default is not NO_DEFAULT:
 343             return default
 344         elif fatal:
 345             name = xpath if name is None else name
 346             raise ExtractorError('Could not find XML element\'s text %s' % name)
 347         else:
 348             return None
 349     return n.text
 350
 351
 352 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 353     n = find_xpath_attr(node, xpath, key)
 354     if n is None:
 355         if default is not NO_DEFAULT:
 356             return default
 357         elif fatal:
 358             name = f'{xpath}[@{key}]' if name is None else name
 359             raise ExtractorError('Could not find XML attribute %s' % name)
 360         else:
 361             return None
 362     return n.attrib[key]
 363
 364
 365 def get_element_by_id(id, html, **kwargs):
 366     """Return the content of the tag with the specified ID in the passed HTML document"""
 367     return get_element_by_attribute('id', id, html, **kwargs)
 368
 369
 370 def get_element_html_by_id(id, html, **kwargs):
 371     """Return the html of the tag with the specified ID in the passed HTML document"""
 372     return get_element_html_by_attribute('id', id, html, **kwargs)
 373
 374
 375 def get_element_by_class(class_name, html):
 376     """Return the content of the first tag with the specified class in the passed HTML document"""
 377     retval = get_elements_by_class(class_name, html)
 378     return retval[0] if retval else None
 379
 380
 381 def get_element_html_by_class(class_name, html):
 382     """Return the html of the first tag with the specified class in the passed HTML document"""
 383     retval = get_elements_html_by_class(class_name, html)
 384     return retval[0] if retval else None
 385
 386
 387 def get_element_by_attribute(attribute, value, html, **kwargs):
 388     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 389     return retval[0] if retval else None
 390
 391
 392 def get_element_html_by_attribute(attribute, value, html, **kargs):
 393     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 394     return retval[0] if retval else None
 395
 396
 397 def get_elements_by_class(class_name, html, **kargs):
 398     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 399     return get_elements_by_attribute(
 400         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 401         html, escape_value=False)
 402
 403
 404 def get_elements_html_by_class(class_name, html):
 405     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 406     return get_elements_html_by_attribute(
 407         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 408         html, escape_value=False)
 409
 410
 411 def get_elements_by_attribute(*args, **kwargs):
 412     """Return the content of the tag with the specified attribute in the passed HTML document"""
 413     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 414
 415
 416 def get_elements_html_by_attribute(*args, **kwargs):
 417     """Return the html of the tag with the specified attribute in the passed HTML document"""
 418     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 419
 420
 421 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 422     """
 423     Return the text (content) and the html (whole) of the tag with the specified
 424     attribute in the passed HTML document
 425     """
 426
 427     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 428
 429     value = re.escape(value) if escape_value else value
 430
 431     partial_element_re = rf'''(?x)
 432         <(?P<tag>[a-zA-Z0-9:._-]+)
 433          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 434          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 435         '''
 436
 437     for m in re.finditer(partial_element_re, html):
 438         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 439
 440         yield (
 441             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 442             whole
 443         )
 444
 445
 446 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 447     """
 448     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 449     closing tag for the first opening tag it has encountered, and can be used
 450     as a context manager
 451     """
 452
 453     class HTMLBreakOnClosingTagException(Exception):
 454         pass
 455
 456     def __init__(self):
 457         self.tagstack = collections.deque()
 458         compat_HTMLParser.__init__(self)
 459
 460     def __enter__(self):
 461         return self
 462
 463     def __exit__(self, *_):
 464         self.close()
 465
 466     def close(self):
 467         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 468         # so data remains buffered; we no longer have any interest in it, thus
 469         # override this method to discard it
 470         pass
 471
 472     def handle_starttag(self, tag, _):
 473         self.tagstack.append(tag)
 474
 475     def handle_endtag(self, tag):
 476         if not self.tagstack:
 477             raise compat_HTMLParseError('no tags in the stack')
 478         while self.tagstack:
 479             inner_tag = self.tagstack.pop()
 480             if inner_tag == tag:
 481                 break
 482         else:
 483             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 484         if not self.tagstack:
 485             raise self.HTMLBreakOnClosingTagException()
 486
 487
 488 def get_element_text_and_html_by_tag(tag, html):
 489     """
 490     For the first element with the specified tag in the passed HTML document
 491     return its' content (text) and the whole element (html)
 492     """
 493     def find_or_raise(haystack, needle, exc):
 494         try:
 495             return haystack.index(needle)
 496         except ValueError:
 497             raise exc
 498     closing_tag = f'</{tag}>'
 499     whole_start = find_or_raise(
 500         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 501     content_start = find_or_raise(
 502         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 503     content_start += whole_start + 1
 504     with HTMLBreakOnClosingTagParser() as parser:
 505         parser.feed(html[whole_start:content_start])
 506         if not parser.tagstack or parser.tagstack[0] != tag:
 507             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 508         offset = content_start
 509         while offset < len(html):
 510             next_closing_tag_start = find_or_raise(
 511                 html[offset:], closing_tag,
 512                 compat_HTMLParseError(f'closing {tag} tag not found'))
 513             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 514             try:
 515                 parser.feed(html[offset:offset + next_closing_tag_end])
 516                 offset += next_closing_tag_end
 517             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 518                 return html[content_start:offset + next_closing_tag_start], \
 519                     html[whole_start:offset + next_closing_tag_end]
 520         raise compat_HTMLParseError('unexpected end of html')
 521
 522
 523 class HTMLAttributeParser(compat_HTMLParser):
 524     """Trivial HTML parser to gather the attributes for a single element"""
 525
 526     def __init__(self):
 527         self.attrs = {}
 528         compat_HTMLParser.__init__(self)
 529
 530     def handle_starttag(self, tag, attrs):
 531         self.attrs = dict(attrs)
 532
 533
 534 class HTMLListAttrsParser(compat_HTMLParser):
 535     """HTML parser to gather the attributes for the elements of a list"""
 536
 537     def __init__(self):
 538         compat_HTMLParser.__init__(self)
 539         self.items = []
 540         self._level = 0
 541
 542     def handle_starttag(self, tag, attrs):
 543         if tag == 'li' and self._level == 0:
 544             self.items.append(dict(attrs))
 545         self._level += 1
 546
 547     def handle_endtag(self, tag):
 548         self._level -= 1
 549
 550
 551 def extract_attributes(html_element):
 552     """Given a string for an HTML element such as
 553     <el
 554          a="foo" B="bar" c="&98;az" d=boz
 555          empty= noval entity="&amp;"
 556          sq='"' dq="'"
 557     >
 558     Decode and return a dictionary of attributes.
 559     {
 560         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 561         'empty': '', 'noval': None, 'entity': '&',
 562         'sq': '"', 'dq': '\''
 563     }.
 564     """
 565     parser = HTMLAttributeParser()
 566     with contextlib.suppress(compat_HTMLParseError):
 567         parser.feed(html_element)
 568         parser.close()
 569     return parser.attrs
 570
 571
 572 def parse_list(webpage):
 573     """Given a string for an series of HTML <li> elements,
 574     return a dictionary of their attributes"""
 575     parser = HTMLListAttrsParser()
 576     parser.feed(webpage)
 577     parser.close()
 578     return parser.items
 579
 580
 581 def clean_html(html):
 582     """Clean an HTML snippet into a readable string"""
 583
 584     if html is None:  # Convenience for sanitizing descriptions etc.
 585         return html
 586
 587     html = re.sub(r'\s+', ' ', html)
 588     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 589     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 590     # Strip html tags
 591     html = re.sub('<.*?>', '', html)
 592     # Replace html entities
 593     html = unescapeHTML(html)
 594     return html.strip()
 595
 596
 597 def sanitize_open(filename, open_mode):
 598     """Try to open the given filename, and slightly tweak it if this fails.
 599
 600     Attempts to open the given filename. If this fails, it tries to change
 601     the filename slightly, step by step, until it's either able to open it
 602     or it fails and raises a final exception, like the standard open()
 603     function.
 604
 605     It returns the tuple (stream, definitive_file_name).
 606     """
 607     if filename == '-':
 608         if sys.platform == 'win32':
 609             import msvcrt
 610             msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 611         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 612
 613     for attempt in range(2):
 614         try:
 615             try:
 616                 if sys.platform == 'win32':
 617                     # FIXME: An exclusive lock also locks the file from being read.
 618                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 619                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 620                     raise LockingUnsupportedError()
 621                 stream = locked_file(filename, open_mode, block=False).__enter__()
 622             except LockingUnsupportedError:
 623                 stream = open(filename, open_mode)
 624             return (stream, filename)
 625         except OSError as err:
 626             if attempt or err.errno in (errno.EACCES,):
 627                 raise
 628             old_filename, filename = filename, sanitize_path(filename)
 629             if old_filename == filename:
 630                 raise
 631
 632
 633 def timeconvert(timestr):
 634     """Convert RFC 2822 defined time string into system timestamp"""
 635     timestamp = None
 636     timetuple = email.utils.parsedate_tz(timestr)
 637     if timetuple is not None:
 638         timestamp = email.utils.mktime_tz(timetuple)
 639     return timestamp
 640
 641
 642 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 643     """Sanitizes a string so it could be used as part of a filename.
 644     @param restricted   Use a stricter subset of allowed characters
 645     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 646                         If unset, yt-dlp's new sanitization rules are in effect
 647     """
 648     if s == '':
 649         return ''
 650
 651     def replace_insane(char):
 652         if restricted and char in ACCENT_CHARS:
 653             return ACCENT_CHARS[char]
 654         elif not restricted and char == '\n':
 655             return '\0 '
 656         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 657             return ''
 658         elif char == '"':
 659             return '' if restricted else '\''
 660         elif char == ':':
 661             return '\0_\0-' if restricted else '\0 \0-'
 662         elif char in '\\/|*<>':
 663             return '\0_'
 664         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 665             return '\0_'
 666         return char
 667
 668     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 669     result = ''.join(map(replace_insane, s))
 670     if is_id is NO_DEFAULT:
 671         result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
 672         STRIP_RE = '(?:\0.|[ _-])*'
 673         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 674     result = result.replace('\0', '') or '_'
 675
 676     if not is_id:
 677         while '__' in result:
 678             result = result.replace('__', '_')
 679         result = result.strip('_')
 680         # Common case of "Foreign band name - English song title"
 681         if restricted and result.startswith('-_'):
 682             result = result[2:]
 683         if result.startswith('-'):
 684             result = '_' + result[len('-'):]
 685         result = result.lstrip('.')
 686         if not result:
 687             result = '_'
 688     return result
 689
 690
 691 def sanitize_path(s, force=False):
 692     """Sanitizes and normalizes path on Windows"""
 693     if sys.platform == 'win32':
 694         force = False
 695         drive_or_unc, _ = os.path.splitdrive(s)
 696     elif force:
 697         drive_or_unc = ''
 698     else:
 699         return s
 700
 701     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 702     if drive_or_unc:
 703         norm_path.pop(0)
 704     sanitized_path = [
 705         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 706         for path_part in norm_path]
 707     if drive_or_unc:
 708         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 709     elif force and s and s[0] == os.path.sep:
 710         sanitized_path.insert(0, os.path.sep)
 711     return os.path.join(*sanitized_path)
 712
 713
 714 def sanitize_url(url):
 715     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 716     # the number of unwanted failures due to missing protocol
 717     if url is None:
 718         return
 719     elif url.startswith('//'):
 720         return 'http:%s' % url
 721     # Fix some common typos seen so far
 722     COMMON_TYPOS = (
 723         # https://github.com/ytdl-org/youtube-dl/issues/15649
 724         (r'^httpss://', r'https://'),
 725         # https://bx1.be/lives/direct-tv/
 726         (r'^rmtp([es]?)://', r'rtmp\1://'),
 727     )
 728     for mistake, fixup in COMMON_TYPOS:
 729         if re.match(mistake, url):
 730             return re.sub(mistake, fixup, url)
 731     return url
 732
 733
 734 def extract_basic_auth(url):
 735     parts = compat_urlparse.urlsplit(url)
 736     if parts.username is None:
 737         return url, None
 738     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 739         parts.hostname if parts.port is None
 740         else '%s:%d' % (parts.hostname, parts.port))))
 741     auth_payload = base64.b64encode(
 742         ('%s:%s' % (parts.username, parts.password or '')).encode())
 743     return url, f'Basic {auth_payload.decode()}'
 744
 745
 746 def sanitized_Request(url, *args, **kwargs):
 747     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 748     if auth_header is not None:
 749         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 750         headers['Authorization'] = auth_header
 751     return compat_urllib_request.Request(url, *args, **kwargs)
 752
 753
 754 def expand_path(s):
 755     """Expand shell variables and ~"""
 756     return os.path.expandvars(compat_expanduser(s))
 757
 758
 759 def orderedSet(iterable):
 760     """ Remove all duplicates from the input iterable """
 761     res = []
 762     for el in iterable:
 763         if el not in res:
 764             res.append(el)
 765     return res
 766
 767
 768 def _htmlentity_transform(entity_with_semicolon):
 769     """Transforms an HTML entity to a character."""
 770     entity = entity_with_semicolon[:-1]
 771
 772     # Known non-numeric HTML entity
 773     if entity in compat_html_entities.name2codepoint:
 774         return compat_chr(compat_html_entities.name2codepoint[entity])
 775
 776     # TODO: HTML5 allows entities without a semicolon. For example,
 777     # '&Eacuteric' should be decoded as 'Éric'.
 778     if entity_with_semicolon in compat_html_entities_html5:
 779         return compat_html_entities_html5[entity_with_semicolon]
 780
 781     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 782     if mobj is not None:
 783         numstr = mobj.group(1)
 784         if numstr.startswith('x'):
 785             base = 16
 786             numstr = '0%s' % numstr
 787         else:
 788             base = 10
 789         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 790         with contextlib.suppress(ValueError):
 791             return compat_chr(int(numstr, base))
 792
 793     # Unknown entity in name, return its literal representation
 794     return '&%s;' % entity
 795
 796
 797 def unescapeHTML(s):
 798     if s is None:
 799         return None
 800     assert isinstance(s, str)
 801
 802     return re.sub(
 803         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 804
 805
 806 def escapeHTML(text):
 807     return (
 808         text
 809         .replace('&', '&amp;')
 810         .replace('<', '&lt;')
 811         .replace('>', '&gt;')
 812         .replace('"', '&quot;')
 813         .replace("'", '&#39;')
 814     )
 815
 816
 817 def process_communicate_or_kill(p, *args, **kwargs):
 818     try:
 819         return p.communicate(*args, **kwargs)
 820     except BaseException:  # Including KeyboardInterrupt
 821         p.kill()
 822         p.wait()
 823         raise
 824
 825
 826 class Popen(subprocess.Popen):
 827     if sys.platform == 'win32':
 828         _startupinfo = subprocess.STARTUPINFO()
 829         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 830     else:
 831         _startupinfo = None
 832
 833     def __init__(self, *args, **kwargs):
 834         super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
 835
 836     def communicate_or_kill(self, *args, **kwargs):
 837         return process_communicate_or_kill(self, *args, **kwargs)
 838
 839
 840 def get_subprocess_encoding():
 841     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 842         # For subprocess calls, encode with locale encoding
 843         # Refer to http://stackoverflow.com/a/9951851/35070
 844         encoding = preferredencoding()
 845     else:
 846         encoding = sys.getfilesystemencoding()
 847     if encoding is None:
 848         encoding = 'utf-8'
 849     return encoding
 850
 851
 852 def encodeFilename(s, for_subprocess=False):
 853     assert isinstance(s, str)
 854     return s
 855
 856
 857 def decodeFilename(b, for_subprocess=False):
 858     return b
 859
 860
 861 def encodeArgument(s):
 862     # Legacy code that uses byte strings
 863     # Uncomment the following line after fixing all post processors
 864     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 865     return s if isinstance(s, str) else s.decode('ascii')
 866
 867
 868 def decodeArgument(b):
 869     return b
 870
 871
 872 def decodeOption(optval):
 873     if optval is None:
 874         return optval
 875     if isinstance(optval, bytes):
 876         optval = optval.decode(preferredencoding())
 877
 878     assert isinstance(optval, compat_str)
 879     return optval
 880
 881
 882 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 883
 884
 885 def timetuple_from_msec(msec):
 886     secs, msec = divmod(msec, 1000)
 887     mins, secs = divmod(secs, 60)
 888     hrs, mins = divmod(mins, 60)
 889     return _timetuple(hrs, mins, secs, msec)
 890
 891
 892 def formatSeconds(secs, delim=':', msec=False):
 893     time = timetuple_from_msec(secs * 1000)
 894     if time.hours:
 895         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 896     elif time.minutes:
 897         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 898     else:
 899         ret = '%d' % time.seconds
 900     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 901
 902
 903 def _ssl_load_windows_store_certs(ssl_context, storename):
 904     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 905     try:
 906         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 907                  if encoding == 'x509_asn' and (
 908                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 909     except PermissionError:
 910         return
 911     for cert in certs:
 912         with contextlib.suppress(ssl.SSLError):
 913             ssl_context.load_verify_locations(cadata=cert)
 914
 915
 916 def make_HTTPS_handler(params, **kwargs):
 917     opts_check_certificate = not params.get('nocheckcertificate')
 918     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 919     context.check_hostname = opts_check_certificate
 920     if params.get('legacyserverconnect'):
 921         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
 922         # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
 923         context.set_ciphers('DEFAULT')
 924     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
 925     if opts_check_certificate:
 926         if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
 927             context.load_verify_locations(cafile=certifi.where())
 928         else:
 929             try:
 930                 context.load_default_certs()
 931                 # Work around the issue in load_default_certs when there are bad certificates. See:
 932                 # https://github.com/yt-dlp/yt-dlp/issues/1060,
 933                 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
 934             except ssl.SSLError:
 935                 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
 936                 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
 937                     for storename in ('CA', 'ROOT'):
 938                         _ssl_load_windows_store_certs(context, storename)
 939                 context.set_default_verify_paths()
 940     client_certfile = params.get('client_certificate')
 941     if client_certfile:
 942         try:
 943             context.load_cert_chain(
 944                 client_certfile, keyfile=params.get('client_certificate_key'),
 945                 password=params.get('client_certificate_password'))
 946         except ssl.SSLError:
 947             raise YoutubeDLError('Unable to load client certificate')
 948     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 949
 950
 951 def bug_reports_message(before=';'):
 952     msg = ('please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , '
 953            'filling out the appropriate issue template. '
 954            'Confirm you are on the latest version using  yt-dlp -U')
 955
 956     before = before.rstrip()
 957     if not before or before.endswith(('.', '!', '?')):
 958         msg = msg[0].title() + msg[1:]
 959
 960     return (before + ' ' if before else '') + msg
 961
 962
 963 class YoutubeDLError(Exception):
 964     """Base exception for YoutubeDL errors."""
 965     msg = None
 966
 967     def __init__(self, msg=None):
 968         if msg is not None:
 969             self.msg = msg
 970         elif self.msg is None:
 971             self.msg = type(self).__name__
 972         super().__init__(self.msg)
 973
 974
 975 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 976 if hasattr(ssl, 'CertificateError'):
 977     network_exceptions.append(ssl.CertificateError)
 978 network_exceptions = tuple(network_exceptions)
 979
 980
 981 class ExtractorError(YoutubeDLError):
 982     """Error during info extraction."""
 983
 984     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
 985         """ tb, if given, is the original traceback (so that it can be printed out).
 986         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
 987         """
 988         if sys.exc_info()[0] in network_exceptions:
 989             expected = True
 990
 991         self.orig_msg = str(msg)
 992         self.traceback = tb
 993         self.expected = expected
 994         self.cause = cause
 995         self.video_id = video_id
 996         self.ie = ie
 997         self.exc_info = sys.exc_info()  # preserve original exception
 998
 999         super().__init__(''.join((
1000             format_field(ie, template='[%s] '),
1001             format_field(video_id, template='%s: '),
1002             msg,
1003             format_field(cause, template=' (caused by %r)'),
1004             '' if expected else bug_reports_message())))
1005
1006     def format_traceback(self):
1007         return join_nonempty(
1008             self.traceback and ''.join(traceback.format_tb(self.traceback)),
1009             self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1010             delim='\n') or None
1011
1012
1013 class UnsupportedError(ExtractorError):
1014     def __init__(self, url):
1015         super().__init__(
1016             'Unsupported URL: %s' % url, expected=True)
1017         self.url = url
1018
1019
1020 class RegexNotFoundError(ExtractorError):
1021     """Error when a regex didn't match"""
1022     pass
1023
1024
1025 class GeoRestrictedError(ExtractorError):
1026     """Geographic restriction Error exception.
1027
1028     This exception may be thrown when a video is not available from your
1029     geographic location due to geographic restrictions imposed by a website.
1030     """
1031
1032     def __init__(self, msg, countries=None, **kwargs):
1033         kwargs['expected'] = True
1034         super().__init__(msg, **kwargs)
1035         self.countries = countries
1036
1037
1038 class DownloadError(YoutubeDLError):
1039     """Download Error exception.
1040
1041     This exception may be thrown by FileDownloader objects if they are not
1042     configured to continue on errors. They will contain the appropriate
1043     error message.
1044     """
1045
1046     def __init__(self, msg, exc_info=None):
1047         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1048         super().__init__(msg)
1049         self.exc_info = exc_info
1050
1051
1052 class EntryNotInPlaylist(YoutubeDLError):
1053     """Entry not in playlist exception.
1054
1055     This exception will be thrown by YoutubeDL when a requested entry
1056     is not found in the playlist info_dict
1057     """
1058     msg = 'Entry not found in info'
1059
1060
1061 class SameFileError(YoutubeDLError):
1062     """Same File exception.
1063
1064     This exception will be thrown by FileDownloader objects if they detect
1065     multiple files would have to be downloaded to the same file on disk.
1066     """
1067     msg = 'Fixed output name but more than one file to download'
1068
1069     def __init__(self, filename=None):
1070         if filename is not None:
1071             self.msg += f': {filename}'
1072         super().__init__(self.msg)
1073
1074
1075 class PostProcessingError(YoutubeDLError):
1076     """Post Processing exception.
1077
1078     This exception may be raised by PostProcessor's .run() method to
1079     indicate an error in the postprocessing task.
1080     """
1081
1082
1083 class DownloadCancelled(YoutubeDLError):
1084     """ Exception raised when the download queue should be interrupted """
1085     msg = 'The download was cancelled'
1086
1087
1088 class ExistingVideoReached(DownloadCancelled):
1089     """ --break-on-existing triggered """
1090     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1091
1092
1093 class RejectedVideoReached(DownloadCancelled):
1094     """ --break-on-reject triggered """
1095     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1096
1097
1098 class MaxDownloadsReached(DownloadCancelled):
1099     """ --max-downloads limit has been reached. """
1100     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1101
1102
1103 class ReExtractInfo(YoutubeDLError):
1104     """ Video info needs to be re-extracted. """
1105
1106     def __init__(self, msg, expected=False):
1107         super().__init__(msg)
1108         self.expected = expected
1109
1110
1111 class ThrottledDownload(ReExtractInfo):
1112     """ Download speed below --throttled-rate. """
1113     msg = 'The download speed is below throttle limit'
1114
1115     def __init__(self):
1116         super().__init__(self.msg, expected=False)
1117
1118
1119 class UnavailableVideoError(YoutubeDLError):
1120     """Unavailable Format exception.
1121
1122     This exception will be thrown when a video is requested
1123     in a format that is not available for that video.
1124     """
1125     msg = 'Unable to download video'
1126
1127     def __init__(self, err=None):
1128         if err is not None:
1129             self.msg += f': {err}'
1130         super().__init__(self.msg)
1131
1132
1133 class ContentTooShortError(YoutubeDLError):
1134     """Content Too Short exception.
1135
1136     This exception may be raised by FileDownloader objects when a file they
1137     download is too small for what the server announced first, indicating
1138     the connection was probably interrupted.
1139     """
1140
1141     def __init__(self, downloaded, expected):
1142         super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1143         # Both in bytes
1144         self.downloaded = downloaded
1145         self.expected = expected
1146
1147
1148 class XAttrMetadataError(YoutubeDLError):
1149     def __init__(self, code=None, msg='Unknown error'):
1150         super().__init__(msg)
1151         self.code = code
1152         self.msg = msg
1153
1154         # Parsing code and msg
1155         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1156                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1157             self.reason = 'NO_SPACE'
1158         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1159             self.reason = 'VALUE_TOO_LONG'
1160         else:
1161             self.reason = 'NOT_SUPPORTED'
1162
1163
1164 class XAttrUnavailableError(YoutubeDLError):
1165     pass
1166
1167
1168 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1169     hc = http_class(*args, **kwargs)
1170     source_address = ydl_handler._params.get('source_address')
1171
1172     if source_address is not None:
1173         # This is to workaround _create_connection() from socket where it will try all
1174         # address data from getaddrinfo() including IPv6. This filters the result from
1175         # getaddrinfo() based on the source_address value.
1176         # This is based on the cpython socket.create_connection() function.
1177         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1178         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1179             host, port = address
1180             err = None
1181             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1182             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1183             ip_addrs = [addr for addr in addrs if addr[0] == af]
1184             if addrs and not ip_addrs:
1185                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1186                 raise OSError(
1187                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1188                     % (ip_version, source_address[0]))
1189             for res in ip_addrs:
1190                 af, socktype, proto, canonname, sa = res
1191                 sock = None
1192                 try:
1193                     sock = socket.socket(af, socktype, proto)
1194                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1195                         sock.settimeout(timeout)
1196                     sock.bind(source_address)
1197                     sock.connect(sa)
1198                     err = None  # Explicitly break reference cycle
1199                     return sock
1200                 except OSError as _:
1201                     err = _
1202                     if sock is not None:
1203                         sock.close()
1204             if err is not None:
1205                 raise err
1206             else:
1207                 raise OSError('getaddrinfo returns an empty list')
1208         if hasattr(hc, '_create_connection'):
1209             hc._create_connection = _create_connection
1210         hc.source_address = (source_address, 0)
1211
1212     return hc
1213
1214
1215 def handle_youtubedl_headers(headers):
1216     filtered_headers = headers
1217
1218     if 'Youtubedl-no-compression' in filtered_headers:
1219         filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
1220         del filtered_headers['Youtubedl-no-compression']
1221
1222     return filtered_headers
1223
1224
1225 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1226     """Handler for HTTP requests and responses.
1227
1228     This class, when installed with an OpenerDirector, automatically adds
1229     the standard headers to every HTTP request and handles gzipped and
1230     deflated responses from web servers. If compression is to be avoided in
1231     a particular request, the original request in the program code only has
1232     to include the HTTP header "Youtubedl-no-compression", which will be
1233     removed before making the real request.
1234
1235     Part of this code was copied from:
1236
1237     http://techknack.net/python-urllib2-handlers/
1238
1239     Andrew Rowls, the author of that code, agreed to release it to the
1240     public domain.
1241     """
1242
1243     def __init__(self, params, *args, **kwargs):
1244         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1245         self._params = params
1246
1247     def http_open(self, req):
1248         conn_class = compat_http_client.HTTPConnection
1249
1250         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1251         if socks_proxy:
1252             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1253             del req.headers['Ytdl-socks-proxy']
1254
1255         return self.do_open(functools.partial(
1256             _create_http_connection, self, conn_class, False),
1257             req)
1258
1259     @staticmethod
1260     def deflate(data):
1261         if not data:
1262             return data
1263         try:
1264             return zlib.decompress(data, -zlib.MAX_WBITS)
1265         except zlib.error:
1266             return zlib.decompress(data)
1267
1268     @staticmethod
1269     def brotli(data):
1270         if not data:
1271             return data
1272         return brotli.decompress(data)
1273
1274     def http_request(self, req):
1275         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1276         # always respected by websites, some tend to give out URLs with non percent-encoded
1277         # non-ASCII characters (see telemb.py, ard.py [#3412])
1278         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1279         # To work around aforementioned issue we will replace request's original URL with
1280         # percent-encoded one
1281         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1282         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1283         url = req.get_full_url()
1284         url_escaped = escape_url(url)
1285
1286         # Substitute URL if any change after escaping
1287         if url != url_escaped:
1288             req = update_Request(req, url=url_escaped)
1289
1290         for h, v in self._params.get('http_headers', std_headers).items():
1291             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1292             # The dict keys are capitalized because of this bug by urllib
1293             if h.capitalize() not in req.headers:
1294                 req.add_header(h, v)
1295
1296         if 'Accept-encoding' not in req.headers:
1297             req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1298
1299         req.headers = handle_youtubedl_headers(req.headers)
1300
1301         return req
1302
1303     def http_response(self, req, resp):
1304         old_resp = resp
1305         # gzip
1306         if resp.headers.get('Content-encoding', '') == 'gzip':
1307             content = resp.read()
1308             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1309             try:
1310                 uncompressed = io.BytesIO(gz.read())
1311             except OSError as original_ioerror:
1312                 # There may be junk add the end of the file
1313                 # See http://stackoverflow.com/q/4928560/35070 for details
1314                 for i in range(1, 1024):
1315                     try:
1316                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1317                         uncompressed = io.BytesIO(gz.read())
1318                     except OSError:
1319                         continue
1320                     break
1321                 else:
1322                     raise original_ioerror
1323             resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1324             resp.msg = old_resp.msg
1325             del resp.headers['Content-encoding']
1326         # deflate
1327         if resp.headers.get('Content-encoding', '') == 'deflate':
1328             gz = io.BytesIO(self.deflate(resp.read()))
1329             resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1330             resp.msg = old_resp.msg
1331             del resp.headers['Content-encoding']
1332         # brotli
1333         if resp.headers.get('Content-encoding', '') == 'br':
1334             resp = compat_urllib_request.addinfourl(
1335                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1336             resp.msg = old_resp.msg
1337             del resp.headers['Content-encoding']
1338         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1339         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1340         if 300 <= resp.code < 400:
1341             location = resp.headers.get('Location')
1342             if location:
1343                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1344                 location = location.encode('iso-8859-1').decode()
1345                 location_escaped = escape_url(location)
1346                 if location != location_escaped:
1347                     del resp.headers['Location']
1348                     resp.headers['Location'] = location_escaped
1349         return resp
1350
1351     https_request = http_request
1352     https_response = http_response
1353
1354
1355 def make_socks_conn_class(base_class, socks_proxy):
1356     assert issubclass(base_class, (
1357         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1358
1359     url_components = compat_urlparse.urlparse(socks_proxy)
1360     if url_components.scheme.lower() == 'socks5':
1361         socks_type = ProxyType.SOCKS5
1362     elif url_components.scheme.lower() in ('socks', 'socks4'):
1363         socks_type = ProxyType.SOCKS4
1364     elif url_components.scheme.lower() == 'socks4a':
1365         socks_type = ProxyType.SOCKS4A
1366
1367     def unquote_if_non_empty(s):
1368         if not s:
1369             return s
1370         return compat_urllib_parse_unquote_plus(s)
1371
1372     proxy_args = (
1373         socks_type,
1374         url_components.hostname, url_components.port or 1080,
1375         True,  # Remote DNS
1376         unquote_if_non_empty(url_components.username),
1377         unquote_if_non_empty(url_components.password),
1378     )
1379
1380     class SocksConnection(base_class):
1381         def connect(self):
1382             self.sock = sockssocket()
1383             self.sock.setproxy(*proxy_args)
1384             if isinstance(self.timeout, (int, float)):
1385                 self.sock.settimeout(self.timeout)
1386             self.sock.connect((self.host, self.port))
1387
1388             if isinstance(self, compat_http_client.HTTPSConnection):
1389                 if hasattr(self, '_context'):  # Python > 2.6
1390                     self.sock = self._context.wrap_socket(
1391                         self.sock, server_hostname=self.host)
1392                 else:
1393                     self.sock = ssl.wrap_socket(self.sock)
1394
1395     return SocksConnection
1396
1397
1398 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1399     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1400         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1401         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1402         self._params = params
1403
1404     def https_open(self, req):
1405         kwargs = {}
1406         conn_class = self._https_conn_class
1407
1408         if hasattr(self, '_context'):  # python > 2.6
1409             kwargs['context'] = self._context
1410         if hasattr(self, '_check_hostname'):  # python 3.x
1411             kwargs['check_hostname'] = self._check_hostname
1412
1413         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1414         if socks_proxy:
1415             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1416             del req.headers['Ytdl-socks-proxy']
1417
1418         try:
1419             return self.do_open(
1420                 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1421         except urllib.error.URLError as e:
1422             if (isinstance(e.reason, ssl.SSLError)
1423                     and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1424                 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1425             raise
1426
1427
1428 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1429     """
1430     See [1] for cookie file format.
1431
1432     1. https://curl.haxx.se/docs/http-cookies.html
1433     """
1434     _HTTPONLY_PREFIX = '#HttpOnly_'
1435     _ENTRY_LEN = 7
1436     _HEADER = '''# Netscape HTTP Cookie File
1437 # This file is generated by yt-dlp.  Do not edit.
1438
1439 '''
1440     _CookieFileEntry = collections.namedtuple(
1441         'CookieFileEntry',
1442         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1443
1444     def __init__(self, filename=None, *args, **kwargs):
1445         super().__init__(None, *args, **kwargs)
1446         if self.is_path(filename):
1447             filename = os.fspath(filename)
1448         self.filename = filename
1449
1450     @staticmethod
1451     def _true_or_false(cndn):
1452         return 'TRUE' if cndn else 'FALSE'
1453
1454     @staticmethod
1455     def is_path(file):
1456         return isinstance(file, (str, bytes, os.PathLike))
1457
1458     @contextlib.contextmanager
1459     def open(self, file, *, write=False):
1460         if self.is_path(file):
1461             with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1462                 yield f
1463         else:
1464             if write:
1465                 file.truncate(0)
1466             yield file
1467
1468     def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1469         now = time.time()
1470         for cookie in self:
1471             if (not ignore_discard and cookie.discard
1472                     or not ignore_expires and cookie.is_expired(now)):
1473                 continue
1474             name, value = cookie.name, cookie.value
1475             if value is None:
1476                 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1477                 # with no name, whereas http.cookiejar regards it as a
1478                 # cookie with no value.
1479                 name, value = '', name
1480             f.write('%s\n' % '\t'.join((
1481                 cookie.domain,
1482                 self._true_or_false(cookie.domain.startswith('.')),
1483                 cookie.path,
1484                 self._true_or_false(cookie.secure),
1485                 str_or_none(cookie.expires, default=''),
1486                 name, value
1487             )))
1488
1489     def save(self, filename=None, *args, **kwargs):
1490         """
1491         Save cookies to a file.
1492         Code is taken from CPython 3.6
1493         https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1494
1495         if filename is None:
1496             if self.filename is not None:
1497                 filename = self.filename
1498             else:
1499                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1500
1501         # Store session cookies with `expires` set to 0 instead of an empty string
1502         for cookie in self:
1503             if cookie.expires is None:
1504                 cookie.expires = 0
1505
1506         with self.open(filename, write=True) as f:
1507             f.write(self._HEADER)
1508             self._really_save(f, *args, **kwargs)
1509
1510     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1511         """Load cookies from a file."""
1512         if filename is None:
1513             if self.filename is not None:
1514                 filename = self.filename
1515             else:
1516                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1517
1518         def prepare_line(line):
1519             if line.startswith(self._HTTPONLY_PREFIX):
1520                 line = line[len(self._HTTPONLY_PREFIX):]
1521             # comments and empty lines are fine
1522             if line.startswith('#') or not line.strip():
1523                 return line
1524             cookie_list = line.split('\t')
1525             if len(cookie_list) != self._ENTRY_LEN:
1526                 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1527             cookie = self._CookieFileEntry(*cookie_list)
1528             if cookie.expires_at and not cookie.expires_at.isdigit():
1529                 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1530             return line
1531
1532         cf = io.StringIO()
1533         with self.open(filename) as f:
1534             for line in f:
1535                 try:
1536                     cf.write(prepare_line(line))
1537                 except compat_cookiejar.LoadError as e:
1538                     if f'{line.strip()} '[0] in '[{"':
1539                         raise compat_cookiejar.LoadError(
1540                             'Cookies file must be Netscape formatted, not JSON. See  '
1541                             'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1542                     write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1543                     continue
1544         cf.seek(0)
1545         self._really_load(cf, filename, ignore_discard, ignore_expires)
1546         # Session cookies are denoted by either `expires` field set to
1547         # an empty string or 0. MozillaCookieJar only recognizes the former
1548         # (see [1]). So we need force the latter to be recognized as session
1549         # cookies on our own.
1550         # Session cookies may be important for cookies-based authentication,
1551         # e.g. usually, when user does not check 'Remember me' check box while
1552         # logging in on a site, some important cookies are stored as session
1553         # cookies so that not recognizing them will result in failed login.
1554         # 1. https://bugs.python.org/issue17164
1555         for cookie in self:
1556             # Treat `expires=0` cookies as session cookies
1557             if cookie.expires == 0:
1558                 cookie.expires = None
1559                 cookie.discard = True
1560
1561
1562 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1563     def __init__(self, cookiejar=None):
1564         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1565
1566     def http_response(self, request, response):
1567         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1568
1569     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1570     https_response = http_response
1571
1572
1573 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1574     """YoutubeDL redirect handler
1575
1576     The code is based on HTTPRedirectHandler implementation from CPython [1].
1577
1578     This redirect handler solves two issues:
1579      - ensures redirect URL is always unicode under python 2
1580      - introduces support for experimental HTTP response status code
1581        308 Permanent Redirect [2] used by some sites [3]
1582
1583     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1584     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1585     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1586     """
1587
1588     http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1589
1590     def redirect_request(self, req, fp, code, msg, headers, newurl):
1591         """Return a Request or None in response to a redirect.
1592
1593         This is called by the http_error_30x methods when a
1594         redirection response is received.  If a redirection should
1595         take place, return a new Request to allow http_error_30x to
1596         perform the redirect.  Otherwise, raise HTTPError if no-one
1597         else should try to handle this url.  Return None if you can't
1598         but another Handler might.
1599         """
1600         m = req.get_method()
1601         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1602                  or code in (301, 302, 303) and m == "POST")):
1603             raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1604         # Strictly (according to RFC 2616), 301 or 302 in response to
1605         # a POST MUST NOT cause a redirection without confirmation
1606         # from the user (of urllib.request, in this case).  In practice,
1607         # essentially all clients do redirect in this case, so we do
1608         # the same.
1609
1610         # Be conciliant with URIs containing a space.  This is mainly
1611         # redundant with the more complete encoding done in http_error_302(),
1612         # but it is kept for compatibility with other callers.
1613         newurl = newurl.replace(' ', '%20')
1614
1615         CONTENT_HEADERS = ("content-length", "content-type")
1616         # NB: don't use dict comprehension for python 2.6 compatibility
1617         newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1618
1619         # A 303 must either use GET or HEAD for subsequent request
1620         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1621         if code == 303 and m != 'HEAD':
1622             m = 'GET'
1623         # 301 and 302 redirects are commonly turned into a GET from a POST
1624         # for subsequent requests by browsers, so we'll do the same.
1625         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1626         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1627         if code in (301, 302) and m == 'POST':
1628             m = 'GET'
1629
1630         return compat_urllib_request.Request(
1631             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1632             unverifiable=True, method=m)
1633
1634
1635 def extract_timezone(date_str):
1636     m = re.search(
1637         r'''(?x)
1638             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1639             (?P<tz>Z|                                            # just the UTC Z, or
1640                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1641                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1642                    [ ]?                                          # optional space
1643                 (?P<sign>\+|-)                                   # +/-
1644                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1645             $)
1646         ''', date_str)
1647     if not m:
1648         timezone = datetime.timedelta()
1649     else:
1650         date_str = date_str[:-len(m.group('tz'))]
1651         if not m.group('sign'):
1652             timezone = datetime.timedelta()
1653         else:
1654             sign = 1 if m.group('sign') == '+' else -1
1655             timezone = datetime.timedelta(
1656                 hours=sign * int(m.group('hours')),
1657                 minutes=sign * int(m.group('minutes')))
1658     return timezone, date_str
1659
1660
1661 def parse_iso8601(date_str, delimiter='T', timezone=None):
1662     """ Return a UNIX timestamp from the given date """
1663
1664     if date_str is None:
1665         return None
1666
1667     date_str = re.sub(r'\.[0-9]+', '', date_str)
1668
1669     if timezone is None:
1670         timezone, date_str = extract_timezone(date_str)
1671
1672     with contextlib.suppress(ValueError):
1673         date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1674         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1675         return calendar.timegm(dt.timetuple())
1676
1677
1678 def date_formats(day_first=True):
1679     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1680
1681
1682 def unified_strdate(date_str, day_first=True):
1683     """Return a string with the date in the format YYYYMMDD"""
1684
1685     if date_str is None:
1686         return None
1687     upload_date = None
1688     # Replace commas
1689     date_str = date_str.replace(',', ' ')
1690     # Remove AM/PM + timezone
1691     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1692     _, date_str = extract_timezone(date_str)
1693
1694     for expression in date_formats(day_first):
1695         with contextlib.suppress(ValueError):
1696             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1697     if upload_date is None:
1698         timetuple = email.utils.parsedate_tz(date_str)
1699         if timetuple:
1700             with contextlib.suppress(ValueError):
1701                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1702     if upload_date is not None:
1703         return compat_str(upload_date)
1704
1705
1706 def unified_timestamp(date_str, day_first=True):
1707     if date_str is None:
1708         return None
1709
1710     date_str = re.sub(r'[,|]', '', date_str)
1711
1712     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1713     timezone, date_str = extract_timezone(date_str)
1714
1715     # Remove AM/PM + timezone
1716     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1717
1718     # Remove unrecognized timezones from ISO 8601 alike timestamps
1719     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1720     if m:
1721         date_str = date_str[:-len(m.group('tz'))]
1722
1723     # Python only supports microseconds, so remove nanoseconds
1724     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1725     if m:
1726         date_str = m.group(1)
1727
1728     for expression in date_formats(day_first):
1729         with contextlib.suppress(ValueError):
1730             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1731             return calendar.timegm(dt.timetuple())
1732     timetuple = email.utils.parsedate_tz(date_str)
1733     if timetuple:
1734         return calendar.timegm(timetuple) + pm_delta * 3600
1735
1736
1737 def determine_ext(url, default_ext='unknown_video'):
1738     if url is None or '.' not in url:
1739         return default_ext
1740     guess = url.partition('?')[0].rpartition('.')[2]
1741     if re.match(r'^[A-Za-z0-9]+$', guess):
1742         return guess
1743     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1744     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1745         return guess.rstrip('/')
1746     else:
1747         return default_ext
1748
1749
1750 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1751     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1752
1753
1754 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1755     R"""
1756     Return a datetime object from a string.
1757     Supported format:
1758         (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1759
1760     @param format       strftime format of DATE
1761     @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
1762                         auto: round to the unit provided in date_str (if applicable).
1763     """
1764     auto_precision = False
1765     if precision == 'auto':
1766         auto_precision = True
1767         precision = 'microsecond'
1768     today = datetime_round(datetime.datetime.utcnow(), precision)
1769     if date_str in ('now', 'today'):
1770         return today
1771     if date_str == 'yesterday':
1772         return today - datetime.timedelta(days=1)
1773     match = re.match(
1774         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1775         date_str)
1776     if match is not None:
1777         start_time = datetime_from_str(match.group('start'), precision, format)
1778         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1779         unit = match.group('unit')
1780         if unit == 'month' or unit == 'year':
1781             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1782             unit = 'day'
1783         else:
1784             if unit == 'week':
1785                 unit = 'day'
1786                 time *= 7
1787             delta = datetime.timedelta(**{unit + 's': time})
1788             new_date = start_time + delta
1789         if auto_precision:
1790             return datetime_round(new_date, unit)
1791         return new_date
1792
1793     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1794
1795
1796 def date_from_str(date_str, format='%Y%m%d', strict=False):
1797     R"""
1798     Return a date object from a string using datetime_from_str
1799
1800     @param strict  Restrict allowed patterns to "YYYYMMDD" and
1801                    (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1802     """
1803     if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1804         raise ValueError(f'Invalid date format "{date_str}"')
1805     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1806
1807
1808 def datetime_add_months(dt, months):
1809     """Increment/Decrement a datetime object by months."""
1810     month = dt.month + months - 1
1811     year = dt.year + month // 12
1812     month = month % 12 + 1
1813     day = min(dt.day, calendar.monthrange(year, month)[1])
1814     return dt.replace(year, month, day)
1815
1816
1817 def datetime_round(dt, precision='day'):
1818     """
1819     Round a datetime object's time to a specific precision
1820     """
1821     if precision == 'microsecond':
1822         return dt
1823
1824     unit_seconds = {
1825         'day': 86400,
1826         'hour': 3600,
1827         'minute': 60,
1828         'second': 1,
1829     }
1830     roundto = lambda x, n: ((x + n / 2) // n) * n
1831     timestamp = calendar.timegm(dt.timetuple())
1832     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1833
1834
1835 def hyphenate_date(date_str):
1836     """
1837     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1838     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1839     if match is not None:
1840         return '-'.join(match.groups())
1841     else:
1842         return date_str
1843
1844
1845 class DateRange:
1846     """Represents a time interval between two dates"""
1847
1848     def __init__(self, start=None, end=None):
1849         """start and end must be strings in the format accepted by date"""
1850         if start is not None:
1851             self.start = date_from_str(start, strict=True)
1852         else:
1853             self.start = datetime.datetime.min.date()
1854         if end is not None:
1855             self.end = date_from_str(end, strict=True)
1856         else:
1857             self.end = datetime.datetime.max.date()
1858         if self.start > self.end:
1859             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1860
1861     @classmethod
1862     def day(cls, day):
1863         """Returns a range that only contains the given day"""
1864         return cls(day, day)
1865
1866     def __contains__(self, date):
1867         """Check if the date is in the range"""
1868         if not isinstance(date, datetime.date):
1869             date = date_from_str(date)
1870         return self.start <= date <= self.end
1871
1872     def __str__(self):
1873         return f'{self.start.isoformat()} - {self.end.isoformat()}'
1874
1875
1876 def platform_name():
1877     """ Returns the platform name as a compat_str """
1878     res = platform.platform()
1879     if isinstance(res, bytes):
1880         res = res.decode(preferredencoding())
1881
1882     assert isinstance(res, compat_str)
1883     return res
1884
1885
1886 @functools.cache
1887 def get_windows_version():
1888     ''' Get Windows version. None if it's not running on Windows '''
1889     if compat_os_name == 'nt':
1890         return version_tuple(platform.win32_ver()[1])
1891     else:
1892         return None
1893
1894
1895 def write_string(s, out=None, encoding=None):
1896     assert isinstance(s, str)
1897     out = out or sys.stderr
1898
1899     if compat_os_name == 'nt' and supports_terminal_sequences(out):
1900         s = re.sub(r'([\r\n]+)', r' \1', s)
1901
1902     enc = None
1903     if 'b' in getattr(out, 'mode', ''):
1904         enc = encoding or preferredencoding()
1905     elif hasattr(out, 'buffer'):
1906         out = out.buffer
1907         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1908
1909     out.write(s.encode(enc, 'ignore') if enc else s)
1910     out.flush()
1911
1912
1913 def bytes_to_intlist(bs):
1914     if not bs:
1915         return []
1916     if isinstance(bs[0], int):  # Python 3
1917         return list(bs)
1918     else:
1919         return [ord(c) for c in bs]
1920
1921
1922 def intlist_to_bytes(xs):
1923     if not xs:
1924         return b''
1925     return compat_struct_pack('%dB' % len(xs), *xs)
1926
1927
1928 class LockingUnsupportedError(IOError):
1929     msg = 'File locking is not supported on this platform'
1930
1931     def __init__(self):
1932         super().__init__(self.msg)
1933
1934
1935 # Cross-platform file locking
1936 if sys.platform == 'win32':
1937     import ctypes.wintypes
1938     import msvcrt
1939
1940     class OVERLAPPED(ctypes.Structure):
1941         _fields_ = [
1942             ('Internal', ctypes.wintypes.LPVOID),
1943             ('InternalHigh', ctypes.wintypes.LPVOID),
1944             ('Offset', ctypes.wintypes.DWORD),
1945             ('OffsetHigh', ctypes.wintypes.DWORD),
1946             ('hEvent', ctypes.wintypes.HANDLE),
1947         ]
1948
1949     kernel32 = ctypes.windll.kernel32
1950     LockFileEx = kernel32.LockFileEx
1951     LockFileEx.argtypes = [
1952         ctypes.wintypes.HANDLE,     # hFile
1953         ctypes.wintypes.DWORD,      # dwFlags
1954         ctypes.wintypes.DWORD,      # dwReserved
1955         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1956         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1957         ctypes.POINTER(OVERLAPPED)  # Overlapped
1958     ]
1959     LockFileEx.restype = ctypes.wintypes.BOOL
1960     UnlockFileEx = kernel32.UnlockFileEx
1961     UnlockFileEx.argtypes = [
1962         ctypes.wintypes.HANDLE,     # hFile
1963         ctypes.wintypes.DWORD,      # dwReserved
1964         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1965         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1966         ctypes.POINTER(OVERLAPPED)  # Overlapped
1967     ]
1968     UnlockFileEx.restype = ctypes.wintypes.BOOL
1969     whole_low = 0xffffffff
1970     whole_high = 0x7fffffff
1971
1972     def _lock_file(f, exclusive, block):
1973         overlapped = OVERLAPPED()
1974         overlapped.Offset = 0
1975         overlapped.OffsetHigh = 0
1976         overlapped.hEvent = 0
1977         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1978
1979         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1980                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1981                           0, whole_low, whole_high, f._lock_file_overlapped_p):
1982             raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
1983
1984     def _unlock_file(f):
1985         assert f._lock_file_overlapped_p
1986         handle = msvcrt.get_osfhandle(f.fileno())
1987         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1988             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1989
1990 else:
1991     try:
1992         import fcntl
1993
1994         def _lock_file(f, exclusive, block):
1995             flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1996             if not block:
1997                 flags |= fcntl.LOCK_NB
1998             try:
1999                 fcntl.flock(f, flags)
2000             except BlockingIOError:
2001                 raise
2002             except OSError:  # AOSP does not have flock()
2003                 fcntl.lockf(f, flags)
2004
2005         def _unlock_file(f):
2006             try:
2007                 fcntl.flock(f, fcntl.LOCK_UN)
2008             except OSError:
2009                 fcntl.lockf(f, fcntl.LOCK_UN)
2010
2011     except ImportError:
2012
2013         def _lock_file(f, exclusive, block):
2014             raise LockingUnsupportedError()
2015
2016         def _unlock_file(f):
2017             raise LockingUnsupportedError()
2018
2019
2020 class locked_file:
2021     locked = False
2022
2023     def __init__(self, filename, mode, block=True, encoding=None):
2024         if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2025             raise NotImplementedError(mode)
2026         self.mode, self.block = mode, block
2027
2028         writable = any(f in mode for f in 'wax+')
2029         readable = any(f in mode for f in 'r+')
2030         flags = functools.reduce(operator.ior, (
2031             getattr(os, 'O_CLOEXEC', 0),  # UNIX only
2032             getattr(os, 'O_BINARY', 0),  # Windows only
2033             getattr(os, 'O_NOINHERIT', 0),  # Windows only
2034             os.O_CREAT if writable else 0,  # O_TRUNC only after locking
2035             os.O_APPEND if 'a' in mode else 0,
2036             os.O_EXCL if 'x' in mode else 0,
2037             os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2038         ))
2039
2040         self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2041
2042     def __enter__(self):
2043         exclusive = 'r' not in self.mode
2044         try:
2045             _lock_file(self.f, exclusive, self.block)
2046             self.locked = True
2047         except OSError:
2048             self.f.close()
2049             raise
2050         if 'w' in self.mode:
2051             try:
2052                 self.f.truncate()
2053             except OSError as e:
2054                 if e.errno != 29:  # Illegal seek, expected when self.f is a FIFO
2055                     raise e
2056         return self
2057
2058     def unlock(self):
2059         if not self.locked:
2060             return
2061         try:
2062             _unlock_file(self.f)
2063         finally:
2064             self.locked = False
2065
2066     def __exit__(self, *_):
2067         try:
2068             self.unlock()
2069         finally:
2070             self.f.close()
2071
2072     open = __enter__
2073     close = __exit__
2074
2075     def __getattr__(self, attr):
2076         return getattr(self.f, attr)
2077
2078     def __iter__(self):
2079         return iter(self.f)
2080
2081
2082 @functools.cache
2083 def get_filesystem_encoding():
2084     encoding = sys.getfilesystemencoding()
2085     return encoding if encoding is not None else 'utf-8'
2086
2087
2088 def shell_quote(args):
2089     quoted_args = []
2090     encoding = get_filesystem_encoding()
2091     for a in args:
2092         if isinstance(a, bytes):
2093             # We may get a filename encoded with 'encodeFilename'
2094             a = a.decode(encoding)
2095         quoted_args.append(compat_shlex_quote(a))
2096     return ' '.join(quoted_args)
2097
2098
2099 def smuggle_url(url, data):
2100     """ Pass additional data in a URL for internal use. """
2101
2102     url, idata = unsmuggle_url(url, {})
2103     data.update(idata)
2104     sdata = compat_urllib_parse_urlencode(
2105         {'__youtubedl_smuggle': json.dumps(data)})
2106     return url + '#' + sdata
2107
2108
2109 def unsmuggle_url(smug_url, default=None):
2110     if '#__youtubedl_smuggle' not in smug_url:
2111         return smug_url, default
2112     url, _, sdata = smug_url.rpartition('#')
2113     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2114     data = json.loads(jsond)
2115     return url, data
2116
2117
2118 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2119     """ Formats numbers with decimal sufixes like K, M, etc """
2120     num, factor = float_or_none(num), float(factor)
2121     if num is None or num < 0:
2122         return None
2123     POSSIBLE_SUFFIXES = 'kMGTPEZY'
2124     exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2125     suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2126     if factor == 1024:
2127         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2128     converted = num / (factor ** exponent)
2129     return fmt % (converted, suffix)
2130
2131
2132 def format_bytes(bytes):
2133     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2134
2135
2136 def lookup_unit_table(unit_table, s):
2137     units_re = '|'.join(re.escape(u) for u in unit_table)
2138     m = re.match(
2139         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2140     if not m:
2141         return None
2142     num_str = m.group('num').replace(',', '.')
2143     mult = unit_table[m.group('unit')]
2144     return int(float(num_str) * mult)
2145
2146
2147 def parse_filesize(s):
2148     if s is None:
2149         return None
2150
2151     # The lower-case forms are of course incorrect and unofficial,
2152     # but we support those too
2153     _UNIT_TABLE = {
2154         'B': 1,
2155         'b': 1,
2156         'bytes': 1,
2157         'KiB': 1024,
2158         'KB': 1000,
2159         'kB': 1024,
2160         'Kb': 1000,
2161         'kb': 1000,
2162         'kilobytes': 1000,
2163         'kibibytes': 1024,
2164         'MiB': 1024 ** 2,
2165         'MB': 1000 ** 2,
2166         'mB': 1024 ** 2,
2167         'Mb': 1000 ** 2,
2168         'mb': 1000 ** 2,
2169         'megabytes': 1000 ** 2,
2170         'mebibytes': 1024 ** 2,
2171         'GiB': 1024 ** 3,
2172         'GB': 1000 ** 3,
2173         'gB': 1024 ** 3,
2174         'Gb': 1000 ** 3,
2175         'gb': 1000 ** 3,
2176         'gigabytes': 1000 ** 3,
2177         'gibibytes': 1024 ** 3,
2178         'TiB': 1024 ** 4,
2179         'TB': 1000 ** 4,
2180         'tB': 1024 ** 4,
2181         'Tb': 1000 ** 4,
2182         'tb': 1000 ** 4,
2183         'terabytes': 1000 ** 4,
2184         'tebibytes': 1024 ** 4,
2185         'PiB': 1024 ** 5,
2186         'PB': 1000 ** 5,
2187         'pB': 1024 ** 5,
2188         'Pb': 1000 ** 5,
2189         'pb': 1000 ** 5,
2190         'petabytes': 1000 ** 5,
2191         'pebibytes': 1024 ** 5,
2192         'EiB': 1024 ** 6,
2193         'EB': 1000 ** 6,
2194         'eB': 1024 ** 6,
2195         'Eb': 1000 ** 6,
2196         'eb': 1000 ** 6,
2197         'exabytes': 1000 ** 6,
2198         'exbibytes': 1024 ** 6,
2199         'ZiB': 1024 ** 7,
2200         'ZB': 1000 ** 7,
2201         'zB': 1024 ** 7,
2202         'Zb': 1000 ** 7,
2203         'zb': 1000 ** 7,
2204         'zettabytes': 1000 ** 7,
2205         'zebibytes': 1024 ** 7,
2206         'YiB': 1024 ** 8,
2207         'YB': 1000 ** 8,
2208         'yB': 1024 ** 8,
2209         'Yb': 1000 ** 8,
2210         'yb': 1000 ** 8,
2211         'yottabytes': 1000 ** 8,
2212         'yobibytes': 1024 ** 8,
2213     }
2214
2215     return lookup_unit_table(_UNIT_TABLE, s)
2216
2217
2218 def parse_count(s):
2219     if s is None:
2220         return None
2221
2222     s = re.sub(r'^[^\d]+\s', '', s).strip()
2223
2224     if re.match(r'^[\d,.]+$', s):
2225         return str_to_int(s)
2226
2227     _UNIT_TABLE = {
2228         'k': 1000,
2229         'K': 1000,
2230         'm': 1000 ** 2,
2231         'M': 1000 ** 2,
2232         'kk': 1000 ** 2,
2233         'KK': 1000 ** 2,
2234         'b': 1000 ** 3,
2235         'B': 1000 ** 3,
2236     }
2237
2238     ret = lookup_unit_table(_UNIT_TABLE, s)
2239     if ret is not None:
2240         return ret
2241
2242     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2243     if mobj:
2244         return str_to_int(mobj.group(1))
2245
2246
2247 def parse_resolution(s, *, lenient=False):
2248     if s is None:
2249         return {}
2250
2251     if lenient:
2252         mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2253     else:
2254         mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2255     if mobj:
2256         return {
2257             'width': int(mobj.group('w')),
2258             'height': int(mobj.group('h')),
2259         }
2260
2261     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2262     if mobj:
2263         return {'height': int(mobj.group(1))}
2264
2265     mobj = re.search(r'\b([48])[kK]\b', s)
2266     if mobj:
2267         return {'height': int(mobj.group(1)) * 540}
2268
2269     return {}
2270
2271
2272 def parse_bitrate(s):
2273     if not isinstance(s, compat_str):
2274         return
2275     mobj = re.search(r'\b(\d+)\s*kbps', s)
2276     if mobj:
2277         return int(mobj.group(1))
2278
2279
2280 def month_by_name(name, lang='en'):
2281     """ Return the number of a month by (locale-independently) English name """
2282
2283     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2284
2285     try:
2286         return month_names.index(name) + 1
2287     except ValueError:
2288         return None
2289
2290
2291 def month_by_abbreviation(abbrev):
2292     """ Return the number of a month by (locale-independently) English
2293         abbreviations """
2294
2295     try:
2296         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2297     except ValueError:
2298         return None
2299
2300
2301 def fix_xml_ampersands(xml_str):
2302     """Replace all the '&' by '&amp;' in XML"""
2303     return re.sub(
2304         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2305         '&amp;',
2306         xml_str)
2307
2308
2309 def setproctitle(title):
2310     assert isinstance(title, compat_str)
2311
2312     # ctypes in Jython is not complete
2313     # http://bugs.jython.org/issue2148
2314     if sys.platform.startswith('java'):
2315         return
2316
2317     try:
2318         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2319     except OSError:
2320         return
2321     except TypeError:
2322         # LoadLibrary in Windows Python 2.7.13 only expects
2323         # a bytestring, but since unicode_literals turns
2324         # every string into a unicode string, it fails.
2325         return
2326     title_bytes = title.encode()
2327     buf = ctypes.create_string_buffer(len(title_bytes))
2328     buf.value = title_bytes
2329     try:
2330         libc.prctl(15, buf, 0, 0, 0)
2331     except AttributeError:
2332         return  # Strange libc, just skip this
2333
2334
2335 def remove_start(s, start):
2336     return s[len(start):] if s is not None and s.startswith(start) else s
2337
2338
2339 def remove_end(s, end):
2340     return s[:-len(end)] if s is not None and s.endswith(end) else s
2341
2342
2343 def remove_quotes(s):
2344     if s is None or len(s) < 2:
2345         return s
2346     for quote in ('"', "'", ):
2347         if s[0] == quote and s[-1] == quote:
2348             return s[1:-1]
2349     return s
2350
2351
2352 def get_domain(url):
2353     domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2354     return domain.group('domain') if domain else None
2355
2356
2357 def url_basename(url):
2358     path = compat_urlparse.urlparse(url).path
2359     return path.strip('/').split('/')[-1]
2360
2361
2362 def base_url(url):
2363     return re.match(r'https?://[^?#&]+/', url).group()
2364
2365
2366 def urljoin(base, path):
2367     if isinstance(path, bytes):
2368         path = path.decode()
2369     if not isinstance(path, compat_str) or not path:
2370         return None
2371     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2372         return path
2373     if isinstance(base, bytes):
2374         base = base.decode()
2375     if not isinstance(base, compat_str) or not re.match(
2376             r'^(?:https?:)?//', base):
2377         return None
2378     return compat_urlparse.urljoin(base, path)
2379
2380
2381 class HEADRequest(compat_urllib_request.Request):
2382     def get_method(self):
2383         return 'HEAD'
2384
2385
2386 class PUTRequest(compat_urllib_request.Request):
2387     def get_method(self):
2388         return 'PUT'
2389
2390
2391 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2392     if get_attr and v is not None:
2393         v = getattr(v, get_attr, None)
2394     try:
2395         return int(v) * invscale // scale
2396     except (ValueError, TypeError, OverflowError):
2397         return default
2398
2399
2400 def str_or_none(v, default=None):
2401     return default if v is None else compat_str(v)
2402
2403
2404 def str_to_int(int_str):
2405     """ A more relaxed version of int_or_none """
2406     if isinstance(int_str, int):
2407         return int_str
2408     elif isinstance(int_str, compat_str):
2409         int_str = re.sub(r'[,\.\+]', '', int_str)
2410         return int_or_none(int_str)
2411
2412
2413 def float_or_none(v, scale=1, invscale=1, default=None):
2414     if v is None:
2415         return default
2416     try:
2417         return float(v) * invscale / scale
2418     except (ValueError, TypeError):
2419         return default
2420
2421
2422 def bool_or_none(v, default=None):
2423     return v if isinstance(v, bool) else default
2424
2425
2426 def strip_or_none(v, default=None):
2427     return v.strip() if isinstance(v, compat_str) else default
2428
2429
2430 def url_or_none(url):
2431     if not url or not isinstance(url, compat_str):
2432         return None
2433     url = url.strip()
2434     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2435
2436
2437 def request_to_url(req):
2438     if isinstance(req, compat_urllib_request.Request):
2439         return req.get_full_url()
2440     else:
2441         return req
2442
2443
2444 def strftime_or_none(timestamp, date_format, default=None):
2445     datetime_object = None
2446     try:
2447         if isinstance(timestamp, (int, float)):  # unix timestamp
2448             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2449         elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
2450             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2451         return datetime_object.strftime(date_format)
2452     except (ValueError, TypeError, AttributeError):
2453         return default
2454
2455
2456 def parse_duration(s):
2457     if not isinstance(s, str):
2458         return None
2459     s = s.strip()
2460     if not s:
2461         return None
2462
2463     days, hours, mins, secs, ms = [None] * 5
2464     m = re.match(r'''(?x)
2465             (?P<before_secs>
2466                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2467             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2468             (?P<ms>[.:][0-9]+)?Z?$
2469         ''', s)
2470     if m:
2471         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2472     else:
2473         m = re.match(
2474             r'''(?ix)(?:P?
2475                 (?:
2476                     [0-9]+\s*y(?:ears?)?,?\s*
2477                 )?
2478                 (?:
2479                     [0-9]+\s*m(?:onths?)?,?\s*
2480                 )?
2481                 (?:
2482                     [0-9]+\s*w(?:eeks?)?,?\s*
2483                 )?
2484                 (?:
2485                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2486                 )?
2487                 T)?
2488                 (?:
2489                     (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2490                 )?
2491                 (?:
2492                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2493                 )?
2494                 (?:
2495                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2496                 )?Z?$''', s)
2497         if m:
2498             days, hours, mins, secs, ms = m.groups()
2499         else:
2500             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2501             if m:
2502                 hours, mins = m.groups()
2503             else:
2504                 return None
2505
2506     if ms:
2507         ms = ms.replace(':', '.')
2508     return sum(float(part or 0) * mult for part, mult in (
2509         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2510
2511
2512 def prepend_extension(filename, ext, expected_real_ext=None):
2513     name, real_ext = os.path.splitext(filename)
2514     return (
2515         f'{name}.{ext}{real_ext}'
2516         if not expected_real_ext or real_ext[1:] == expected_real_ext
2517         else f'{filename}.{ext}')
2518
2519
2520 def replace_extension(filename, ext, expected_real_ext=None):
2521     name, real_ext = os.path.splitext(filename)
2522     return '{}.{}'.format(
2523         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2524         ext)
2525
2526
2527 def check_executable(exe, args=[]):
2528     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2529     args can be a list of arguments for a short output (like -version) """
2530     try:
2531         Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2532     except OSError:
2533         return False
2534     return exe
2535
2536
2537 def _get_exe_version_output(exe, args, *, to_screen=None):
2538     if to_screen:
2539         to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2540     try:
2541         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2542         # SIGTTOU if yt-dlp is run in the background.
2543         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2544         out, _ = Popen(
2545             [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2546             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2547     except OSError:
2548         return False
2549     if isinstance(out, bytes):  # Python 2.x
2550         out = out.decode('ascii', 'ignore')
2551     return out
2552
2553
2554 def detect_exe_version(output, version_re=None, unrecognized='present'):
2555     assert isinstance(output, compat_str)
2556     if version_re is None:
2557         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2558     m = re.search(version_re, output)
2559     if m:
2560         return m.group(1)
2561     else:
2562         return unrecognized
2563
2564
2565 def get_exe_version(exe, args=['--version'],
2566                     version_re=None, unrecognized='present'):
2567     """ Returns the version of the specified executable,
2568     or False if the executable is not present """
2569     out = _get_exe_version_output(exe, args)
2570     return detect_exe_version(out, version_re, unrecognized) if out else False
2571
2572
2573 class LazyList(collections.abc.Sequence):
2574     """Lazy immutable list from an iterable
2575     Note that slices of a LazyList are lists and not LazyList"""
2576
2577     class IndexError(IndexError):
2578         pass
2579
2580     def __init__(self, iterable, *, reverse=False, _cache=None):
2581         self._iterable = iter(iterable)
2582         self._cache = [] if _cache is None else _cache
2583         self._reversed = reverse
2584
2585     def __iter__(self):
2586         if self._reversed:
2587             # We need to consume the entire iterable to iterate in reverse
2588             yield from self.exhaust()
2589             return
2590         yield from self._cache
2591         for item in self._iterable:
2592             self._cache.append(item)
2593             yield item
2594
2595     def _exhaust(self):
2596         self._cache.extend(self._iterable)
2597         self._iterable = []  # Discard the emptied iterable to make it pickle-able
2598         return self._cache
2599
2600     def exhaust(self):
2601         """Evaluate the entire iterable"""
2602         return self._exhaust()[::-1 if self._reversed else 1]
2603
2604     @staticmethod
2605     def _reverse_index(x):
2606         return None if x is None else -(x + 1)
2607
2608     def __getitem__(self, idx):
2609         if isinstance(idx, slice):
2610             if self._reversed:
2611                 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2612             start, stop, step = idx.start, idx.stop, idx.step or 1
2613         elif isinstance(idx, int):
2614             if self._reversed:
2615                 idx = self._reverse_index(idx)
2616             start, stop, step = idx, idx, 0
2617         else:
2618             raise TypeError('indices must be integers or slices')
2619         if ((start or 0) < 0 or (stop or 0) < 0
2620                 or (start is None and step < 0)
2621                 or (stop is None and step > 0)):
2622             # We need to consume the entire iterable to be able to slice from the end
2623             # Obviously, never use this with infinite iterables
2624             self._exhaust()
2625             try:
2626                 return self._cache[idx]
2627             except IndexError as e:
2628                 raise self.IndexError(e) from e
2629         n = max(start or 0, stop or 0) - len(self._cache) + 1
2630         if n > 0:
2631             self._cache.extend(itertools.islice(self._iterable, n))
2632         try:
2633             return self._cache[idx]
2634         except IndexError as e:
2635             raise self.IndexError(e) from e
2636
2637     def __bool__(self):
2638         try:
2639             self[-1] if self._reversed else self[0]
2640         except self.IndexError:
2641             return False
2642         return True
2643
2644     def __len__(self):
2645         self._exhaust()
2646         return len(self._cache)
2647
2648     def __reversed__(self):
2649         return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2650
2651     def __copy__(self):
2652         return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2653
2654     def __repr__(self):
2655         # repr and str should mimic a list. So we exhaust the iterable
2656         return repr(self.exhaust())
2657
2658     def __str__(self):
2659         return repr(self.exhaust())
2660
2661
2662 class PagedList:
2663
2664     class IndexError(IndexError):
2665         pass
2666
2667     def __len__(self):
2668         # This is only useful for tests
2669         return len(self.getslice())
2670
2671     def __init__(self, pagefunc, pagesize, use_cache=True):
2672         self._pagefunc = pagefunc
2673         self._pagesize = pagesize
2674         self._pagecount = float('inf')
2675         self._use_cache = use_cache
2676         self._cache = {}
2677
2678     def getpage(self, pagenum):
2679         page_results = self._cache.get(pagenum)
2680         if page_results is None:
2681             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2682         if self._use_cache:
2683             self._cache[pagenum] = page_results
2684         return page_results
2685
2686     def getslice(self, start=0, end=None):
2687         return list(self._getslice(start, end))
2688
2689     def _getslice(self, start, end):
2690         raise NotImplementedError('This method must be implemented by subclasses')
2691
2692     def __getitem__(self, idx):
2693         assert self._use_cache, 'Indexing PagedList requires cache'
2694         if not isinstance(idx, int) or idx < 0:
2695             raise TypeError('indices must be non-negative integers')
2696         entries = self.getslice(idx, idx + 1)
2697         if not entries:
2698             raise self.IndexError()
2699         return entries[0]
2700
2701
2702 class OnDemandPagedList(PagedList):
2703     """Download pages until a page with less than maximum results"""
2704
2705     def _getslice(self, start, end):
2706         for pagenum in itertools.count(start // self._pagesize):
2707             firstid = pagenum * self._pagesize
2708             nextfirstid = pagenum * self._pagesize + self._pagesize
2709             if start >= nextfirstid:
2710                 continue
2711
2712             startv = (
2713                 start % self._pagesize
2714                 if firstid <= start < nextfirstid
2715                 else 0)
2716             endv = (
2717                 ((end - 1) % self._pagesize) + 1
2718                 if (end is not None and firstid <= end <= nextfirstid)
2719                 else None)
2720
2721             try:
2722                 page_results = self.getpage(pagenum)
2723             except Exception:
2724                 self._pagecount = pagenum - 1
2725                 raise
2726             if startv != 0 or endv is not None:
2727                 page_results = page_results[startv:endv]
2728             yield from page_results
2729
2730             # A little optimization - if current page is not "full", ie. does
2731             # not contain page_size videos then we can assume that this page
2732             # is the last one - there are no more ids on further pages -
2733             # i.e. no need to query again.
2734             if len(page_results) + startv < self._pagesize:
2735                 break
2736
2737             # If we got the whole page, but the next page is not interesting,
2738             # break out early as well
2739             if end == nextfirstid:
2740                 break
2741
2742
2743 class InAdvancePagedList(PagedList):
2744     """PagedList with total number of pages known in advance"""
2745
2746     def __init__(self, pagefunc, pagecount, pagesize):
2747         PagedList.__init__(self, pagefunc, pagesize, True)
2748         self._pagecount = pagecount
2749
2750     def _getslice(self, start, end):
2751         start_page = start // self._pagesize
2752         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2753         skip_elems = start - start_page * self._pagesize
2754         only_more = None if end is None else end - start
2755         for pagenum in range(start_page, end_page):
2756             page_results = self.getpage(pagenum)
2757             if skip_elems:
2758                 page_results = page_results[skip_elems:]
2759                 skip_elems = None
2760             if only_more is not None:
2761                 if len(page_results) < only_more:
2762                     only_more -= len(page_results)
2763                 else:
2764                     yield from page_results[:only_more]
2765                     break
2766             yield from page_results
2767
2768
2769 def uppercase_escape(s):
2770     unicode_escape = codecs.getdecoder('unicode_escape')
2771     return re.sub(
2772         r'\\U[0-9a-fA-F]{8}',
2773         lambda m: unicode_escape(m.group(0))[0],
2774         s)
2775
2776
2777 def lowercase_escape(s):
2778     unicode_escape = codecs.getdecoder('unicode_escape')
2779     return re.sub(
2780         r'\\u[0-9a-fA-F]{4}',
2781         lambda m: unicode_escape(m.group(0))[0],
2782         s)
2783
2784
2785 def escape_rfc3986(s):
2786     """Escape non-ASCII characters as suggested by RFC 3986"""
2787     return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2788
2789
2790 def escape_url(url):
2791     """Escape URL as suggested by RFC 3986"""
2792     url_parsed = compat_urllib_parse_urlparse(url)
2793     return url_parsed._replace(
2794         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2795         path=escape_rfc3986(url_parsed.path),
2796         params=escape_rfc3986(url_parsed.params),
2797         query=escape_rfc3986(url_parsed.query),
2798         fragment=escape_rfc3986(url_parsed.fragment)
2799     ).geturl()
2800
2801
2802 def parse_qs(url):
2803     return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2804
2805
2806 def read_batch_urls(batch_fd):
2807     def fixup(url):
2808         if not isinstance(url, compat_str):
2809             url = url.decode('utf-8', 'replace')
2810         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2811         for bom in BOM_UTF8:
2812             if url.startswith(bom):
2813                 url = url[len(bom):]
2814         url = url.lstrip()
2815         if not url or url.startswith(('#', ';', ']')):
2816             return False
2817         # "#" cannot be stripped out since it is part of the URI
2818         # However, it can be safely stipped out if follwing a whitespace
2819         return re.split(r'\s#', url, 1)[0].rstrip()
2820
2821     with contextlib.closing(batch_fd) as fd:
2822         return [url for url in map(fixup, fd) if url]
2823
2824
2825 def urlencode_postdata(*args, **kargs):
2826     return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2827
2828
2829 def update_url_query(url, query):
2830     if not query:
2831         return url
2832     parsed_url = compat_urlparse.urlparse(url)
2833     qs = compat_parse_qs(parsed_url.query)
2834     qs.update(query)
2835     return compat_urlparse.urlunparse(parsed_url._replace(
2836         query=compat_urllib_parse_urlencode(qs, True)))
2837
2838
2839 def update_Request(req, url=None, data=None, headers={}, query={}):
2840     req_headers = req.headers.copy()
2841     req_headers.update(headers)
2842     req_data = data or req.data
2843     req_url = update_url_query(url or req.get_full_url(), query)
2844     req_get_method = req.get_method()
2845     if req_get_method == 'HEAD':
2846         req_type = HEADRequest
2847     elif req_get_method == 'PUT':
2848         req_type = PUTRequest
2849     else:
2850         req_type = compat_urllib_request.Request
2851     new_req = req_type(
2852         req_url, data=req_data, headers=req_headers,
2853         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2854     if hasattr(req, 'timeout'):
2855         new_req.timeout = req.timeout
2856     return new_req
2857
2858
2859 def _multipart_encode_impl(data, boundary):
2860     content_type = 'multipart/form-data; boundary=%s' % boundary
2861
2862     out = b''
2863     for k, v in data.items():
2864         out += b'--' + boundary.encode('ascii') + b'\r\n'
2865         if isinstance(k, compat_str):
2866             k = k.encode()
2867         if isinstance(v, compat_str):
2868             v = v.encode()
2869         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2870         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2871         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2872         if boundary.encode('ascii') in content:
2873             raise ValueError('Boundary overlaps with data')
2874         out += content
2875
2876     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2877
2878     return out, content_type
2879
2880
2881 def multipart_encode(data, boundary=None):
2882     '''
2883     Encode a dict to RFC 7578-compliant form-data
2884
2885     data:
2886         A dict where keys and values can be either Unicode or bytes-like
2887         objects.
2888     boundary:
2889         If specified a Unicode object, it's used as the boundary. Otherwise
2890         a random boundary is generated.
2891
2892     Reference: https://tools.ietf.org/html/rfc7578
2893     '''
2894     has_specified_boundary = boundary is not None
2895
2896     while True:
2897         if boundary is None:
2898             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2899
2900         try:
2901             out, content_type = _multipart_encode_impl(data, boundary)
2902             break
2903         except ValueError:
2904             if has_specified_boundary:
2905                 raise
2906             boundary = None
2907
2908     return out, content_type
2909
2910
2911 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2912     for val in map(d.get, variadic(key_or_keys)):
2913         if val is not None and (val or not skip_false_values):
2914             return val
2915     return default
2916
2917
2918 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2919     for f in funcs:
2920         try:
2921             val = f(*args, **kwargs)
2922         except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2923             pass
2924         else:
2925             if expected_type is None or isinstance(val, expected_type):
2926                 return val
2927
2928
2929 def try_get(src, getter, expected_type=None):
2930     return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2931
2932
2933 def filter_dict(dct, cndn=lambda _, v: v is not None):
2934     return {k: v for k, v in dct.items() if cndn(k, v)}
2935
2936
2937 def merge_dicts(*dicts):
2938     merged = {}
2939     for a_dict in dicts:
2940         for k, v in a_dict.items():
2941             if (v is not None and k not in merged
2942                     or isinstance(v, str) and merged[k] == ''):
2943                 merged[k] = v
2944     return merged
2945
2946
2947 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2948     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2949
2950
2951 US_RATINGS = {
2952     'G': 0,
2953     'PG': 10,
2954     'PG-13': 13,
2955     'R': 16,
2956     'NC': 18,
2957 }
2958
2959
2960 TV_PARENTAL_GUIDELINES = {
2961     'TV-Y': 0,
2962     'TV-Y7': 7,
2963     'TV-G': 0,
2964     'TV-PG': 0,
2965     'TV-14': 14,
2966     'TV-MA': 17,
2967 }
2968
2969
2970 def parse_age_limit(s):
2971     # isinstance(False, int) is True. So type() must be used instead
2972     if type(s) is int:  # noqa: E721
2973         return s if 0 <= s <= 21 else None
2974     elif not isinstance(s, str):
2975         return None
2976     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2977     if m:
2978         return int(m.group('age'))
2979     s = s.upper()
2980     if s in US_RATINGS:
2981         return US_RATINGS[s]
2982     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2983     if m:
2984         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2985     return None
2986
2987
2988 def strip_jsonp(code):
2989     return re.sub(
2990         r'''(?sx)^
2991             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2992             (?:\s*&&\s*(?P=func_name))?
2993             \s*\(\s*(?P<callback_data>.*)\);?
2994             \s*?(?://[^\n]*)*$''',
2995         r'\g<callback_data>', code)
2996
2997
2998 def js_to_json(code, vars={}):
2999     # vars is a dict of var, val pairs to substitute
3000     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3001     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3002     INTEGER_TABLE = (
3003         (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3004         (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3005     )
3006
3007     def fix_kv(m):
3008         v = m.group(0)
3009         if v in ('true', 'false', 'null'):
3010             return v
3011         elif v in ('undefined', 'void 0'):
3012             return 'null'
3013         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3014             return ""
3015
3016         if v[0] in ("'", '"'):
3017             v = re.sub(r'(?s)\\.|"', lambda m: {
3018                 '"': '\\"',
3019                 "\\'": "'",
3020                 '\\\n': '',
3021                 '\\x': '\\u00',
3022             }.get(m.group(0), m.group(0)), v[1:-1])
3023         else:
3024             for regex, base in INTEGER_TABLE:
3025                 im = re.match(regex, v)
3026                 if im:
3027                     i = int(im.group(1), base)
3028                     return '"%d":' % i if v.endswith(':') else '%d' % i
3029
3030             if v in vars:
3031                 return vars[v]
3032
3033         return '"%s"' % v
3034
3035     code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3036
3037     return re.sub(r'''(?sx)
3038         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3039         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3040         {comment}|,(?={skip}[\]}}])|
3041         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3042         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3043         [0-9]+(?={skip}:)|
3044         !+
3045         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3046
3047
3048 def qualities(quality_ids):
3049     """ Get a numeric quality value out of a list of possible values """
3050     def q(qid):
3051         try:
3052             return quality_ids.index(qid)
3053         except ValueError:
3054             return -1
3055     return q
3056
3057
3058 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3059
3060
3061 DEFAULT_OUTTMPL = {
3062     'default': '%(title)s [%(id)s].%(ext)s',
3063     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3064 }
3065 OUTTMPL_TYPES = {
3066     'chapter': None,
3067     'subtitle': None,
3068     'thumbnail': None,
3069     'description': 'description',
3070     'annotation': 'annotations.xml',
3071     'infojson': 'info.json',
3072     'link': None,
3073     'pl_video': None,
3074     'pl_thumbnail': None,
3075     'pl_description': 'description',
3076     'pl_infojson': 'info.json',
3077 }
3078
3079 # As of [1] format syntax is:
3080 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3081 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3082 STR_FORMAT_RE_TMPL = r'''(?x)
3083     (?<!%)(?P<prefix>(?:%%)*)
3084     %
3085     (?P<has_key>\((?P<key>{0})\))?
3086     (?P<format>
3087         (?P<conversion>[#0\-+ ]+)?
3088         (?P<min_width>\d+)?
3089         (?P<precision>\.\d+)?
3090         (?P<len_mod>[hlL])?  # unused in python
3091         {1}  # conversion type
3092     )
3093 '''
3094
3095
3096 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3097
3098
3099 def limit_length(s, length):
3100     """ Add ellipses to overly long strings """
3101     if s is None:
3102         return None
3103     ELLIPSES = '...'
3104     if len(s) > length:
3105         return s[:length - len(ELLIPSES)] + ELLIPSES
3106     return s
3107
3108
3109 def version_tuple(v):
3110     return tuple(int(e) for e in re.split(r'[-.]', v))
3111
3112
3113 def is_outdated_version(version, limit, assume_new=True):
3114     if not version:
3115         return not assume_new
3116     try:
3117         return version_tuple(version) < version_tuple(limit)
3118     except ValueError:
3119         return not assume_new
3120
3121
3122 def ytdl_is_updateable():
3123     """ Returns if yt-dlp can be updated with -U """
3124
3125     from .update import is_non_updateable
3126
3127     return not is_non_updateable()
3128
3129
3130 def args_to_str(args):
3131     # Get a short string representation for a subprocess command
3132     return ' '.join(compat_shlex_quote(a) for a in args)
3133
3134
3135 def error_to_compat_str(err):
3136     return str(err)
3137
3138
3139 def error_to_str(err):
3140     return f'{type(err).__name__}: {err}'
3141
3142
3143 def mimetype2ext(mt):
3144     if mt is None:
3145         return None
3146
3147     mt, _, params = mt.partition(';')
3148     mt = mt.strip()
3149
3150     FULL_MAP = {
3151         'audio/mp4': 'm4a',
3152         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3153         # it's the most popular one
3154         'audio/mpeg': 'mp3',
3155         'audio/x-wav': 'wav',
3156         'audio/wav': 'wav',
3157         'audio/wave': 'wav',
3158     }
3159
3160     ext = FULL_MAP.get(mt)
3161     if ext is not None:
3162         return ext
3163
3164     SUBTYPE_MAP = {
3165         '3gpp': '3gp',
3166         'smptett+xml': 'tt',
3167         'ttaf+xml': 'dfxp',
3168         'ttml+xml': 'ttml',
3169         'x-flv': 'flv',
3170         'x-mp4-fragmented': 'mp4',
3171         'x-ms-sami': 'sami',
3172         'x-ms-wmv': 'wmv',
3173         'mpegurl': 'm3u8',
3174         'x-mpegurl': 'm3u8',
3175         'vnd.apple.mpegurl': 'm3u8',
3176         'dash+xml': 'mpd',
3177         'f4m+xml': 'f4m',
3178         'hds+xml': 'f4m',
3179         'vnd.ms-sstr+xml': 'ism',
3180         'quicktime': 'mov',
3181         'mp2t': 'ts',
3182         'x-wav': 'wav',
3183         'filmstrip+json': 'fs',
3184         'svg+xml': 'svg',
3185     }
3186
3187     _, _, subtype = mt.rpartition('/')
3188     ext = SUBTYPE_MAP.get(subtype.lower())
3189     if ext is not None:
3190         return ext
3191
3192     SUFFIX_MAP = {
3193         'json': 'json',
3194         'xml': 'xml',
3195         'zip': 'zip',
3196         'gzip': 'gz',
3197     }
3198
3199     _, _, suffix = subtype.partition('+')
3200     ext = SUFFIX_MAP.get(suffix)
3201     if ext is not None:
3202         return ext
3203
3204     return subtype.replace('+', '.')
3205
3206
3207 def ext2mimetype(ext_or_url):
3208     if not ext_or_url:
3209         return None
3210     if '.' not in ext_or_url:
3211         ext_or_url = f'file.{ext_or_url}'
3212     return mimetypes.guess_type(ext_or_url)[0]
3213
3214
3215 def parse_codecs(codecs_str):
3216     # http://tools.ietf.org/html/rfc6381
3217     if not codecs_str:
3218         return {}
3219     split_codecs = list(filter(None, map(
3220         str.strip, codecs_str.strip().strip(',').split(','))))
3221     vcodec, acodec, scodec, hdr = None, None, None, None
3222     for full_codec in split_codecs:
3223         parts = full_codec.split('.')
3224         codec = parts[0].replace('0', '')
3225         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3226                      'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3227             if not vcodec:
3228                 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3229                 if codec in ('dvh1', 'dvhe'):
3230                     hdr = 'DV'
3231                 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3232                     hdr = 'HDR10'
3233                 elif full_codec.replace('0', '').startswith('vp9.2'):
3234                     hdr = 'HDR10'
3235         elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3236             if not acodec:
3237                 acodec = full_codec
3238         elif codec in ('stpp', 'wvtt',):
3239             if not scodec:
3240                 scodec = full_codec
3241         else:
3242             write_string(f'WARNING: Unknown codec {full_codec}\n')
3243     if vcodec or acodec or scodec:
3244         return {
3245             'vcodec': vcodec or 'none',
3246             'acodec': acodec or 'none',
3247             'dynamic_range': hdr,
3248             **({'scodec': scodec} if scodec is not None else {}),
3249         }
3250     elif len(split_codecs) == 2:
3251         return {
3252             'vcodec': split_codecs[0],
3253             'acodec': split_codecs[1],
3254         }
3255     return {}
3256
3257
3258 def urlhandle_detect_ext(url_handle):
3259     getheader = url_handle.headers.get
3260
3261     cd = getheader('Content-Disposition')
3262     if cd:
3263         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3264         if m:
3265             e = determine_ext(m.group('filename'), default_ext=None)
3266             if e:
3267                 return e
3268
3269     return mimetype2ext(getheader('Content-Type'))
3270
3271
3272 def encode_data_uri(data, mime_type):
3273     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3274
3275
3276 def age_restricted(content_limit, age_limit):
3277     """ Returns True iff the content should be blocked """
3278
3279     if age_limit is None:  # No limit set
3280         return False
3281     if content_limit is None:
3282         return False  # Content available for everyone
3283     return age_limit < content_limit
3284
3285
3286 def is_html(first_bytes):
3287     """ Detect whether a file contains HTML by examining its first bytes. """
3288
3289     BOMS = [
3290         (b'\xef\xbb\xbf', 'utf-8'),
3291         (b'\x00\x00\xfe\xff', 'utf-32-be'),
3292         (b'\xff\xfe\x00\x00', 'utf-32-le'),
3293         (b'\xff\xfe', 'utf-16-le'),
3294         (b'\xfe\xff', 'utf-16-be'),
3295     ]
3296
3297     encoding = 'utf-8'
3298     for bom, enc in BOMS:
3299         while first_bytes.startswith(bom):
3300             encoding, first_bytes = enc, first_bytes[len(bom):]
3301
3302     return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3303
3304
3305 def determine_protocol(info_dict):
3306     protocol = info_dict.get('protocol')
3307     if protocol is not None:
3308         return protocol
3309
3310     url = sanitize_url(info_dict['url'])
3311     if url.startswith('rtmp'):
3312         return 'rtmp'
3313     elif url.startswith('mms'):
3314         return 'mms'
3315     elif url.startswith('rtsp'):
3316         return 'rtsp'
3317
3318     ext = determine_ext(url)
3319     if ext == 'm3u8':
3320         return 'm3u8'
3321     elif ext == 'f4m':
3322         return 'f4m'
3323
3324     return compat_urllib_parse_urlparse(url).scheme
3325
3326
3327 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3328     """ Render a list of rows, each as a list of values.
3329     Text after a \t will be right aligned """
3330     def width(string):
3331         return len(remove_terminal_sequences(string).replace('\t', ''))
3332
3333     def get_max_lens(table):
3334         return [max(width(str(v)) for v in col) for col in zip(*table)]
3335
3336     def filter_using_list(row, filterArray):
3337         return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3338
3339     max_lens = get_max_lens(data) if hide_empty else []
3340     header_row = filter_using_list(header_row, max_lens)
3341     data = [filter_using_list(row, max_lens) for row in data]
3342
3343     table = [header_row] + data
3344     max_lens = get_max_lens(table)
3345     extra_gap += 1
3346     if delim:
3347         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3348         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3349     for row in table:
3350         for pos, text in enumerate(map(str, row)):
3351             if '\t' in text:
3352                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3353             else:
3354                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3355     ret = '\n'.join(''.join(row).rstrip() for row in table)
3356     return ret
3357
3358
3359 def _match_one(filter_part, dct, incomplete):
3360     # TODO: Generalize code with YoutubeDL._build_format_filter
3361     STRING_OPERATORS = {
3362         '*=': operator.contains,
3363         '^=': lambda attr, value: attr.startswith(value),
3364         '$=': lambda attr, value: attr.endswith(value),
3365         '~=': lambda attr, value: re.search(value, attr),
3366     }
3367     COMPARISON_OPERATORS = {
3368         **STRING_OPERATORS,
3369         '<=': operator.le,  # "<=" must be defined above "<"
3370         '<': operator.lt,
3371         '>=': operator.ge,
3372         '>': operator.gt,
3373         '=': operator.eq,
3374     }
3375
3376     if isinstance(incomplete, bool):
3377         is_incomplete = lambda _: incomplete
3378     else:
3379         is_incomplete = lambda k: k in incomplete
3380
3381     operator_rex = re.compile(r'''(?x)\s*
3382         (?P<key>[a-z_]+)
3383         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3384         (?:
3385             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3386             (?P<strval>.+?)
3387         )
3388         \s*$
3389         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3390     m = operator_rex.search(filter_part)
3391     if m:
3392         m = m.groupdict()
3393         unnegated_op = COMPARISON_OPERATORS[m['op']]
3394         if m['negation']:
3395             op = lambda attr, value: not unnegated_op(attr, value)
3396         else:
3397             op = unnegated_op
3398         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3399         if m['quote']:
3400             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3401         actual_value = dct.get(m['key'])
3402         numeric_comparison = None
3403         if isinstance(actual_value, (int, float)):
3404             # If the original field is a string and matching comparisonvalue is
3405             # a number we should respect the origin of the original field
3406             # and process comparison value as a string (see
3407             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3408             try:
3409                 numeric_comparison = int(comparison_value)
3410             except ValueError:
3411                 numeric_comparison = parse_filesize(comparison_value)
3412                 if numeric_comparison is None:
3413                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3414                 if numeric_comparison is None:
3415                     numeric_comparison = parse_duration(comparison_value)
3416         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3417             raise ValueError('Operator %s only supports string values!' % m['op'])
3418         if actual_value is None:
3419             return is_incomplete(m['key']) or m['none_inclusive']
3420         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3421
3422     UNARY_OPERATORS = {
3423         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3424         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3425     }
3426     operator_rex = re.compile(r'''(?x)\s*
3427         (?P<op>%s)\s*(?P<key>[a-z_]+)
3428         \s*$
3429         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3430     m = operator_rex.search(filter_part)
3431     if m:
3432         op = UNARY_OPERATORS[m.group('op')]
3433         actual_value = dct.get(m.group('key'))
3434         if is_incomplete(m.group('key')) and actual_value is None:
3435             return True
3436         return op(actual_value)
3437
3438     raise ValueError('Invalid filter part %r' % filter_part)
3439
3440
3441 def match_str(filter_str, dct, incomplete=False):
3442     """ Filter a dictionary with a simple string syntax.
3443     @returns           Whether the filter passes
3444     @param incomplete  Set of keys that is expected to be missing from dct.
3445                        Can be True/False to indicate all/none of the keys may be missing.
3446                        All conditions on incomplete keys pass if the key is missing
3447     """
3448     return all(
3449         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3450         for filter_part in re.split(r'(?<!\\)&', filter_str))
3451
3452
3453 def match_filter_func(filters):
3454     if not filters:
3455         return None
3456     filters = set(variadic(filters))
3457
3458     interactive = '-' in filters
3459     if interactive:
3460         filters.remove('-')
3461
3462     def _match_func(info_dict, incomplete=False):
3463         if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3464             return NO_DEFAULT if interactive and not incomplete else None
3465         else:
3466             video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3467             filter_str = ') | ('.join(map(str.strip, filters))
3468             return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3469     return _match_func
3470
3471
3472 def parse_dfxp_time_expr(time_expr):
3473     if not time_expr:
3474         return
3475
3476     mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3477     if mobj:
3478         return float(mobj.group('time_offset'))
3479
3480     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3481     if mobj:
3482         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3483
3484
3485 def srt_subtitles_timecode(seconds):
3486     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3487
3488
3489 def ass_subtitles_timecode(seconds):
3490     time = timetuple_from_msec(seconds * 1000)
3491     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3492
3493
3494 def dfxp2srt(dfxp_data):
3495     '''
3496     @param dfxp_data A bytes-like object containing DFXP data
3497     @returns A unicode object containing converted SRT data
3498     '''
3499     LEGACY_NAMESPACES = (
3500         (b'http://www.w3.org/ns/ttml', [
3501             b'http://www.w3.org/2004/11/ttaf1',
3502             b'http://www.w3.org/2006/04/ttaf1',
3503             b'http://www.w3.org/2006/10/ttaf1',
3504         ]),
3505         (b'http://www.w3.org/ns/ttml#styling', [
3506             b'http://www.w3.org/ns/ttml#style',
3507         ]),
3508     )
3509
3510     SUPPORTED_STYLING = [
3511         'color',
3512         'fontFamily',
3513         'fontSize',
3514         'fontStyle',
3515         'fontWeight',
3516         'textDecoration'
3517     ]
3518
3519     _x = functools.partial(xpath_with_ns, ns_map={
3520         'xml': 'http://www.w3.org/XML/1998/namespace',
3521         'ttml': 'http://www.w3.org/ns/ttml',
3522         'tts': 'http://www.w3.org/ns/ttml#styling',
3523     })
3524
3525     styles = {}
3526     default_style = {}
3527
3528     class TTMLPElementParser:
3529         _out = ''
3530         _unclosed_elements = []
3531         _applied_styles = []
3532
3533         def start(self, tag, attrib):
3534             if tag in (_x('ttml:br'), 'br'):
3535                 self._out += '\n'
3536             else:
3537                 unclosed_elements = []
3538                 style = {}
3539                 element_style_id = attrib.get('style')
3540                 if default_style:
3541                     style.update(default_style)
3542                 if element_style_id:
3543                     style.update(styles.get(element_style_id, {}))
3544                 for prop in SUPPORTED_STYLING:
3545                     prop_val = attrib.get(_x('tts:' + prop))
3546                     if prop_val:
3547                         style[prop] = prop_val
3548                 if style:
3549                     font = ''
3550                     for k, v in sorted(style.items()):
3551                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3552                             continue
3553                         if k == 'color':
3554                             font += ' color="%s"' % v
3555                         elif k == 'fontSize':
3556                             font += ' size="%s"' % v
3557                         elif k == 'fontFamily':
3558                             font += ' face="%s"' % v
3559                         elif k == 'fontWeight' and v == 'bold':
3560                             self._out += '<b>'
3561                             unclosed_elements.append('b')
3562                         elif k == 'fontStyle' and v == 'italic':
3563                             self._out += '<i>'
3564                             unclosed_elements.append('i')
3565                         elif k == 'textDecoration' and v == 'underline':
3566                             self._out += '<u>'
3567                             unclosed_elements.append('u')
3568                     if font:
3569                         self._out += '<font' + font + '>'
3570                         unclosed_elements.append('font')
3571                     applied_style = {}
3572                     if self._applied_styles:
3573                         applied_style.update(self._applied_styles[-1])
3574                     applied_style.update(style)
3575                     self._applied_styles.append(applied_style)
3576                 self._unclosed_elements.append(unclosed_elements)
3577
3578         def end(self, tag):
3579             if tag not in (_x('ttml:br'), 'br'):
3580                 unclosed_elements = self._unclosed_elements.pop()
3581                 for element in reversed(unclosed_elements):
3582                     self._out += '</%s>' % element
3583                 if unclosed_elements and self._applied_styles:
3584                     self._applied_styles.pop()
3585
3586         def data(self, data):
3587             self._out += data
3588
3589         def close(self):
3590             return self._out.strip()
3591
3592     def parse_node(node):
3593         target = TTMLPElementParser()
3594         parser = xml.etree.ElementTree.XMLParser(target=target)
3595         parser.feed(xml.etree.ElementTree.tostring(node))
3596         return parser.close()
3597
3598     for k, v in LEGACY_NAMESPACES:
3599         for ns in v:
3600             dfxp_data = dfxp_data.replace(ns, k)
3601
3602     dfxp = compat_etree_fromstring(dfxp_data)
3603     out = []
3604     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3605
3606     if not paras:
3607         raise ValueError('Invalid dfxp/TTML subtitle')
3608
3609     repeat = False
3610     while True:
3611         for style in dfxp.findall(_x('.//ttml:style')):
3612             style_id = style.get('id') or style.get(_x('xml:id'))
3613             if not style_id:
3614                 continue
3615             parent_style_id = style.get('style')
3616             if parent_style_id:
3617                 if parent_style_id not in styles:
3618                     repeat = True
3619                     continue
3620                 styles[style_id] = styles[parent_style_id].copy()
3621             for prop in SUPPORTED_STYLING:
3622                 prop_val = style.get(_x('tts:' + prop))
3623                 if prop_val:
3624                     styles.setdefault(style_id, {})[prop] = prop_val
3625         if repeat:
3626             repeat = False
3627         else:
3628             break
3629
3630     for p in ('body', 'div'):
3631         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3632         if ele is None:
3633             continue
3634         style = styles.get(ele.get('style'))
3635         if not style:
3636             continue
3637         default_style.update(style)
3638
3639     for para, index in zip(paras, itertools.count(1)):
3640         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3641         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3642         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3643         if begin_time is None:
3644             continue
3645         if not end_time:
3646             if not dur:
3647                 continue
3648             end_time = begin_time + dur
3649         out.append('%d\n%s --> %s\n%s\n\n' % (
3650             index,
3651             srt_subtitles_timecode(begin_time),
3652             srt_subtitles_timecode(end_time),
3653             parse_node(para)))
3654
3655     return ''.join(out)
3656
3657
3658 def cli_option(params, command_option, param, separator=None):
3659     param = params.get(param)
3660     return ([] if param is None
3661             else [command_option, str(param)] if separator is None
3662             else [f'{command_option}{separator}{param}'])
3663
3664
3665 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3666     param = params.get(param)
3667     assert param in (True, False, None)
3668     return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3669
3670
3671 def cli_valueless_option(params, command_option, param, expected_value=True):
3672     return [command_option] if params.get(param) == expected_value else []
3673
3674
3675 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3676     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3677         if use_compat:
3678             return argdict
3679         else:
3680             argdict = None
3681     if argdict is None:
3682         return default
3683     assert isinstance(argdict, dict)
3684
3685     assert isinstance(keys, (list, tuple))
3686     for key_list in keys:
3687         arg_list = list(filter(
3688             lambda x: x is not None,
3689             [argdict.get(key.lower()) for key in variadic(key_list)]))
3690         if arg_list:
3691             return [arg for args in arg_list for arg in args]
3692     return default
3693
3694
3695 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3696     main_key, exe = main_key.lower(), exe.lower()
3697     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3698     keys = [f'{root_key}{k}' for k in (keys or [''])]
3699     if root_key in keys:
3700         if main_key != exe:
3701             keys.append((main_key, exe))
3702         keys.append('default')
3703     else:
3704         use_compat = False
3705     return cli_configuration_args(argdict, keys, default, use_compat)
3706
3707
3708 class ISO639Utils:
3709     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3710     _lang_map = {
3711         'aa': 'aar',
3712         'ab': 'abk',
3713         'ae': 'ave',
3714         'af': 'afr',
3715         'ak': 'aka',
3716         'am': 'amh',
3717         'an': 'arg',
3718         'ar': 'ara',
3719         'as': 'asm',
3720         'av': 'ava',
3721         'ay': 'aym',
3722         'az': 'aze',
3723         'ba': 'bak',
3724         'be': 'bel',
3725         'bg': 'bul',
3726         'bh': 'bih',
3727         'bi': 'bis',
3728         'bm': 'bam',
3729         'bn': 'ben',
3730         'bo': 'bod',
3731         'br': 'bre',
3732         'bs': 'bos',
3733         'ca': 'cat',
3734         'ce': 'che',
3735         'ch': 'cha',
3736         'co': 'cos',
3737         'cr': 'cre',
3738         'cs': 'ces',
3739         'cu': 'chu',
3740         'cv': 'chv',
3741         'cy': 'cym',
3742         'da': 'dan',
3743         'de': 'deu',
3744         'dv': 'div',
3745         'dz': 'dzo',
3746         'ee': 'ewe',
3747         'el': 'ell',
3748         'en': 'eng',
3749         'eo': 'epo',
3750         'es': 'spa',
3751         'et': 'est',
3752         'eu': 'eus',
3753         'fa': 'fas',
3754         'ff': 'ful',
3755         'fi': 'fin',
3756         'fj': 'fij',
3757         'fo': 'fao',
3758         'fr': 'fra',
3759         'fy': 'fry',
3760         'ga': 'gle',
3761         'gd': 'gla',
3762         'gl': 'glg',
3763         'gn': 'grn',
3764         'gu': 'guj',
3765         'gv': 'glv',
3766         'ha': 'hau',
3767         'he': 'heb',
3768         'iw': 'heb',  # Replaced by he in 1989 revision
3769         'hi': 'hin',
3770         'ho': 'hmo',
3771         'hr': 'hrv',
3772         'ht': 'hat',
3773         'hu': 'hun',
3774         'hy': 'hye',
3775         'hz': 'her',
3776         'ia': 'ina',
3777         'id': 'ind',
3778         'in': 'ind',  # Replaced by id in 1989 revision
3779         'ie': 'ile',
3780         'ig': 'ibo',
3781         'ii': 'iii',
3782         'ik': 'ipk',
3783         'io': 'ido',
3784         'is': 'isl',
3785         'it': 'ita',
3786         'iu': 'iku',
3787         'ja': 'jpn',
3788         'jv': 'jav',
3789         'ka': 'kat',
3790         'kg': 'kon',
3791         'ki': 'kik',
3792         'kj': 'kua',
3793         'kk': 'kaz',
3794         'kl': 'kal',
3795         'km': 'khm',
3796         'kn': 'kan',
3797         'ko': 'kor',
3798         'kr': 'kau',
3799         'ks': 'kas',
3800         'ku': 'kur',
3801         'kv': 'kom',
3802         'kw': 'cor',
3803         'ky': 'kir',
3804         'la': 'lat',
3805         'lb': 'ltz',
3806         'lg': 'lug',
3807         'li': 'lim',
3808         'ln': 'lin',
3809         'lo': 'lao',
3810         'lt': 'lit',
3811         'lu': 'lub',
3812         'lv': 'lav',
3813         'mg': 'mlg',
3814         'mh': 'mah',
3815         'mi': 'mri',
3816         'mk': 'mkd',
3817         'ml': 'mal',
3818         'mn': 'mon',
3819         'mr': 'mar',
3820         'ms': 'msa',
3821         'mt': 'mlt',
3822         'my': 'mya',
3823         'na': 'nau',
3824         'nb': 'nob',
3825         'nd': 'nde',
3826         'ne': 'nep',
3827         'ng': 'ndo',
3828         'nl': 'nld',
3829         'nn': 'nno',
3830         'no': 'nor',
3831         'nr': 'nbl',
3832         'nv': 'nav',
3833         'ny': 'nya',
3834         'oc': 'oci',
3835         'oj': 'oji',
3836         'om': 'orm',
3837         'or': 'ori',
3838         'os': 'oss',
3839         'pa': 'pan',
3840         'pi': 'pli',
3841         'pl': 'pol',
3842         'ps': 'pus',
3843         'pt': 'por',
3844         'qu': 'que',
3845         'rm': 'roh',
3846         'rn': 'run',
3847         'ro': 'ron',
3848         'ru': 'rus',
3849         'rw': 'kin',
3850         'sa': 'san',
3851         'sc': 'srd',
3852         'sd': 'snd',
3853         'se': 'sme',
3854         'sg': 'sag',
3855         'si': 'sin',
3856         'sk': 'slk',
3857         'sl': 'slv',
3858         'sm': 'smo',
3859         'sn': 'sna',
3860         'so': 'som',
3861         'sq': 'sqi',
3862         'sr': 'srp',
3863         'ss': 'ssw',
3864         'st': 'sot',
3865         'su': 'sun',
3866         'sv': 'swe',
3867         'sw': 'swa',
3868         'ta': 'tam',
3869         'te': 'tel',
3870         'tg': 'tgk',
3871         'th': 'tha',
3872         'ti': 'tir',
3873         'tk': 'tuk',
3874         'tl': 'tgl',
3875         'tn': 'tsn',
3876         'to': 'ton',
3877         'tr': 'tur',
3878         'ts': 'tso',
3879         'tt': 'tat',
3880         'tw': 'twi',
3881         'ty': 'tah',
3882         'ug': 'uig',
3883         'uk': 'ukr',
3884         'ur': 'urd',
3885         'uz': 'uzb',
3886         've': 'ven',
3887         'vi': 'vie',
3888         'vo': 'vol',
3889         'wa': 'wln',
3890         'wo': 'wol',
3891         'xh': 'xho',
3892         'yi': 'yid',
3893         'ji': 'yid',  # Replaced by yi in 1989 revision
3894         'yo': 'yor',
3895         'za': 'zha',
3896         'zh': 'zho',
3897         'zu': 'zul',
3898     }
3899
3900     @classmethod
3901     def short2long(cls, code):
3902         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3903         return cls._lang_map.get(code[:2])
3904
3905     @classmethod
3906     def long2short(cls, code):
3907         """Convert language code from ISO 639-2/T to ISO 639-1"""
3908         for short_name, long_name in cls._lang_map.items():
3909             if long_name == code:
3910                 return short_name
3911
3912
3913 class ISO3166Utils:
3914     # From http://data.okfn.org/data/core/country-list
3915     _country_map = {
3916         'AF': 'Afghanistan',
3917         'AX': 'Åland Islands',
3918         'AL': 'Albania',
3919         'DZ': 'Algeria',
3920         'AS': 'American Samoa',
3921         'AD': 'Andorra',
3922         'AO': 'Angola',
3923         'AI': 'Anguilla',
3924         'AQ': 'Antarctica',
3925         'AG': 'Antigua and Barbuda',
3926         'AR': 'Argentina',
3927         'AM': 'Armenia',
3928         'AW': 'Aruba',
3929         'AU': 'Australia',
3930         'AT': 'Austria',
3931         'AZ': 'Azerbaijan',
3932         'BS': 'Bahamas',
3933         'BH': 'Bahrain',
3934         'BD': 'Bangladesh',
3935         'BB': 'Barbados',
3936         'BY': 'Belarus',
3937         'BE': 'Belgium',
3938         'BZ': 'Belize',
3939         'BJ': 'Benin',
3940         'BM': 'Bermuda',
3941         'BT': 'Bhutan',
3942         'BO': 'Bolivia, Plurinational State of',
3943         'BQ': 'Bonaire, Sint Eustatius and Saba',
3944         'BA': 'Bosnia and Herzegovina',
3945         'BW': 'Botswana',
3946         'BV': 'Bouvet Island',
3947         'BR': 'Brazil',
3948         'IO': 'British Indian Ocean Territory',
3949         'BN': 'Brunei Darussalam',
3950         'BG': 'Bulgaria',
3951         'BF': 'Burkina Faso',
3952         'BI': 'Burundi',
3953         'KH': 'Cambodia',
3954         'CM': 'Cameroon',
3955         'CA': 'Canada',
3956         'CV': 'Cape Verde',
3957         'KY': 'Cayman Islands',
3958         'CF': 'Central African Republic',
3959         'TD': 'Chad',
3960         'CL': 'Chile',
3961         'CN': 'China',
3962         'CX': 'Christmas Island',
3963         'CC': 'Cocos (Keeling) Islands',
3964         'CO': 'Colombia',
3965         'KM': 'Comoros',
3966         'CG': 'Congo',
3967         'CD': 'Congo, the Democratic Republic of the',
3968         'CK': 'Cook Islands',
3969         'CR': 'Costa Rica',
3970         'CI': 'Côte d\'Ivoire',
3971         'HR': 'Croatia',
3972         'CU': 'Cuba',
3973         'CW': 'Curaçao',
3974         'CY': 'Cyprus',
3975         'CZ': 'Czech Republic',
3976         'DK': 'Denmark',
3977         'DJ': 'Djibouti',
3978         'DM': 'Dominica',
3979         'DO': 'Dominican Republic',
3980         'EC': 'Ecuador',
3981         'EG': 'Egypt',
3982         'SV': 'El Salvador',
3983         'GQ': 'Equatorial Guinea',
3984         'ER': 'Eritrea',
3985         'EE': 'Estonia',
3986         'ET': 'Ethiopia',
3987         'FK': 'Falkland Islands (Malvinas)',
3988         'FO': 'Faroe Islands',
3989         'FJ': 'Fiji',
3990         'FI': 'Finland',
3991         'FR': 'France',
3992         'GF': 'French Guiana',
3993         'PF': 'French Polynesia',
3994         'TF': 'French Southern Territories',
3995         'GA': 'Gabon',
3996         'GM': 'Gambia',
3997         'GE': 'Georgia',
3998         'DE': 'Germany',
3999         'GH': 'Ghana',
4000         'GI': 'Gibraltar',
4001         'GR': 'Greece',
4002         'GL': 'Greenland',
4003         'GD': 'Grenada',
4004         'GP': 'Guadeloupe',
4005         'GU': 'Guam',
4006         'GT': 'Guatemala',
4007         'GG': 'Guernsey',
4008         'GN': 'Guinea',
4009         'GW': 'Guinea-Bissau',
4010         'GY': 'Guyana',
4011         'HT': 'Haiti',
4012         'HM': 'Heard Island and McDonald Islands',
4013         'VA': 'Holy See (Vatican City State)',
4014         'HN': 'Honduras',
4015         'HK': 'Hong Kong',
4016         'HU': 'Hungary',
4017         'IS': 'Iceland',
4018         'IN': 'India',
4019         'ID': 'Indonesia',
4020         'IR': 'Iran, Islamic Republic of',
4021         'IQ': 'Iraq',
4022         'IE': 'Ireland',
4023         'IM': 'Isle of Man',
4024         'IL': 'Israel',
4025         'IT': 'Italy',
4026         'JM': 'Jamaica',
4027         'JP': 'Japan',
4028         'JE': 'Jersey',
4029         'JO': 'Jordan',
4030         'KZ': 'Kazakhstan',
4031         'KE': 'Kenya',
4032         'KI': 'Kiribati',
4033         'KP': 'Korea, Democratic People\'s Republic of',
4034         'KR': 'Korea, Republic of',
4035         'KW': 'Kuwait',
4036         'KG': 'Kyrgyzstan',
4037         'LA': 'Lao People\'s Democratic Republic',
4038         'LV': 'Latvia',
4039         'LB': 'Lebanon',
4040         'LS': 'Lesotho',
4041         'LR': 'Liberia',
4042         'LY': 'Libya',
4043         'LI': 'Liechtenstein',
4044         'LT': 'Lithuania',
4045         'LU': 'Luxembourg',
4046         'MO': 'Macao',
4047         'MK': 'Macedonia, the Former Yugoslav Republic of',
4048         'MG': 'Madagascar',
4049         'MW': 'Malawi',
4050         'MY': 'Malaysia',
4051         'MV': 'Maldives',
4052         'ML': 'Mali',
4053         'MT': 'Malta',
4054         'MH': 'Marshall Islands',
4055         'MQ': 'Martinique',
4056         'MR': 'Mauritania',
4057         'MU': 'Mauritius',
4058         'YT': 'Mayotte',
4059         'MX': 'Mexico',
4060         'FM': 'Micronesia, Federated States of',
4061         'MD': 'Moldova, Republic of',
4062         'MC': 'Monaco',
4063         'MN': 'Mongolia',
4064         'ME': 'Montenegro',
4065         'MS': 'Montserrat',
4066         'MA': 'Morocco',
4067         'MZ': 'Mozambique',
4068         'MM': 'Myanmar',
4069         'NA': 'Namibia',
4070         'NR': 'Nauru',
4071         'NP': 'Nepal',
4072         'NL': 'Netherlands',
4073         'NC': 'New Caledonia',
4074         'NZ': 'New Zealand',
4075         'NI': 'Nicaragua',
4076         'NE': 'Niger',
4077         'NG': 'Nigeria',
4078         'NU': 'Niue',
4079         'NF': 'Norfolk Island',
4080         'MP': 'Northern Mariana Islands',
4081         'NO': 'Norway',
4082         'OM': 'Oman',
4083         'PK': 'Pakistan',
4084         'PW': 'Palau',
4085         'PS': 'Palestine, State of',
4086         'PA': 'Panama',
4087         'PG': 'Papua New Guinea',
4088         'PY': 'Paraguay',
4089         'PE': 'Peru',
4090         'PH': 'Philippines',
4091         'PN': 'Pitcairn',
4092         'PL': 'Poland',
4093         'PT': 'Portugal',
4094         'PR': 'Puerto Rico',
4095         'QA': 'Qatar',
4096         'RE': 'Réunion',
4097         'RO': 'Romania',
4098         'RU': 'Russian Federation',
4099         'RW': 'Rwanda',
4100         'BL': 'Saint Barthélemy',
4101         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4102         'KN': 'Saint Kitts and Nevis',
4103         'LC': 'Saint Lucia',
4104         'MF': 'Saint Martin (French part)',
4105         'PM': 'Saint Pierre and Miquelon',
4106         'VC': 'Saint Vincent and the Grenadines',
4107         'WS': 'Samoa',
4108         'SM': 'San Marino',
4109         'ST': 'Sao Tome and Principe',
4110         'SA': 'Saudi Arabia',
4111         'SN': 'Senegal',
4112         'RS': 'Serbia',
4113         'SC': 'Seychelles',
4114         'SL': 'Sierra Leone',
4115         'SG': 'Singapore',
4116         'SX': 'Sint Maarten (Dutch part)',
4117         'SK': 'Slovakia',
4118         'SI': 'Slovenia',
4119         'SB': 'Solomon Islands',
4120         'SO': 'Somalia',
4121         'ZA': 'South Africa',
4122         'GS': 'South Georgia and the South Sandwich Islands',
4123         'SS': 'South Sudan',
4124         'ES': 'Spain',
4125         'LK': 'Sri Lanka',
4126         'SD': 'Sudan',
4127         'SR': 'Suriname',
4128         'SJ': 'Svalbard and Jan Mayen',
4129         'SZ': 'Swaziland',
4130         'SE': 'Sweden',
4131         'CH': 'Switzerland',
4132         'SY': 'Syrian Arab Republic',
4133         'TW': 'Taiwan, Province of China',
4134         'TJ': 'Tajikistan',
4135         'TZ': 'Tanzania, United Republic of',
4136         'TH': 'Thailand',
4137         'TL': 'Timor-Leste',
4138         'TG': 'Togo',
4139         'TK': 'Tokelau',
4140         'TO': 'Tonga',
4141         'TT': 'Trinidad and Tobago',
4142         'TN': 'Tunisia',
4143         'TR': 'Turkey',
4144         'TM': 'Turkmenistan',
4145         'TC': 'Turks and Caicos Islands',
4146         'TV': 'Tuvalu',
4147         'UG': 'Uganda',
4148         'UA': 'Ukraine',
4149         'AE': 'United Arab Emirates',
4150         'GB': 'United Kingdom',
4151         'US': 'United States',
4152         'UM': 'United States Minor Outlying Islands',
4153         'UY': 'Uruguay',
4154         'UZ': 'Uzbekistan',
4155         'VU': 'Vanuatu',
4156         'VE': 'Venezuela, Bolivarian Republic of',
4157         'VN': 'Viet Nam',
4158         'VG': 'Virgin Islands, British',
4159         'VI': 'Virgin Islands, U.S.',
4160         'WF': 'Wallis and Futuna',
4161         'EH': 'Western Sahara',
4162         'YE': 'Yemen',
4163         'ZM': 'Zambia',
4164         'ZW': 'Zimbabwe',
4165         # Not ISO 3166 codes, but used for IP blocks
4166         'AP': 'Asia/Pacific Region',
4167         'EU': 'Europe',
4168     }
4169
4170     @classmethod
4171     def short2full(cls, code):
4172         """Convert an ISO 3166-2 country code to the corresponding full name"""
4173         return cls._country_map.get(code.upper())
4174
4175
4176 class GeoUtils:
4177     # Major IPv4 address blocks per country
4178     _country_ip_map = {
4179         'AD': '46.172.224.0/19',
4180         'AE': '94.200.0.0/13',
4181         'AF': '149.54.0.0/17',
4182         'AG': '209.59.64.0/18',
4183         'AI': '204.14.248.0/21',
4184         'AL': '46.99.0.0/16',
4185         'AM': '46.70.0.0/15',
4186         'AO': '105.168.0.0/13',
4187         'AP': '182.50.184.0/21',
4188         'AQ': '23.154.160.0/24',
4189         'AR': '181.0.0.0/12',
4190         'AS': '202.70.112.0/20',
4191         'AT': '77.116.0.0/14',
4192         'AU': '1.128.0.0/11',
4193         'AW': '181.41.0.0/18',
4194         'AX': '185.217.4.0/22',
4195         'AZ': '5.197.0.0/16',
4196         'BA': '31.176.128.0/17',
4197         'BB': '65.48.128.0/17',
4198         'BD': '114.130.0.0/16',
4199         'BE': '57.0.0.0/8',
4200         'BF': '102.178.0.0/15',
4201         'BG': '95.42.0.0/15',
4202         'BH': '37.131.0.0/17',
4203         'BI': '154.117.192.0/18',
4204         'BJ': '137.255.0.0/16',
4205         'BL': '185.212.72.0/23',
4206         'BM': '196.12.64.0/18',
4207         'BN': '156.31.0.0/16',
4208         'BO': '161.56.0.0/16',
4209         'BQ': '161.0.80.0/20',
4210         'BR': '191.128.0.0/12',
4211         'BS': '24.51.64.0/18',
4212         'BT': '119.2.96.0/19',
4213         'BW': '168.167.0.0/16',
4214         'BY': '178.120.0.0/13',
4215         'BZ': '179.42.192.0/18',
4216         'CA': '99.224.0.0/11',
4217         'CD': '41.243.0.0/16',
4218         'CF': '197.242.176.0/21',
4219         'CG': '160.113.0.0/16',
4220         'CH': '85.0.0.0/13',
4221         'CI': '102.136.0.0/14',
4222         'CK': '202.65.32.0/19',
4223         'CL': '152.172.0.0/14',
4224         'CM': '102.244.0.0/14',
4225         'CN': '36.128.0.0/10',
4226         'CO': '181.240.0.0/12',
4227         'CR': '201.192.0.0/12',
4228         'CU': '152.206.0.0/15',
4229         'CV': '165.90.96.0/19',
4230         'CW': '190.88.128.0/17',
4231         'CY': '31.153.0.0/16',
4232         'CZ': '88.100.0.0/14',
4233         'DE': '53.0.0.0/8',
4234         'DJ': '197.241.0.0/17',
4235         'DK': '87.48.0.0/12',
4236         'DM': '192.243.48.0/20',
4237         'DO': '152.166.0.0/15',
4238         'DZ': '41.96.0.0/12',
4239         'EC': '186.68.0.0/15',
4240         'EE': '90.190.0.0/15',
4241         'EG': '156.160.0.0/11',
4242         'ER': '196.200.96.0/20',
4243         'ES': '88.0.0.0/11',
4244         'ET': '196.188.0.0/14',
4245         'EU': '2.16.0.0/13',
4246         'FI': '91.152.0.0/13',
4247         'FJ': '144.120.0.0/16',
4248         'FK': '80.73.208.0/21',
4249         'FM': '119.252.112.0/20',
4250         'FO': '88.85.32.0/19',
4251         'FR': '90.0.0.0/9',
4252         'GA': '41.158.0.0/15',
4253         'GB': '25.0.0.0/8',
4254         'GD': '74.122.88.0/21',
4255         'GE': '31.146.0.0/16',
4256         'GF': '161.22.64.0/18',
4257         'GG': '62.68.160.0/19',
4258         'GH': '154.160.0.0/12',
4259         'GI': '95.164.0.0/16',
4260         'GL': '88.83.0.0/19',
4261         'GM': '160.182.0.0/15',
4262         'GN': '197.149.192.0/18',
4263         'GP': '104.250.0.0/19',
4264         'GQ': '105.235.224.0/20',
4265         'GR': '94.64.0.0/13',
4266         'GT': '168.234.0.0/16',
4267         'GU': '168.123.0.0/16',
4268         'GW': '197.214.80.0/20',
4269         'GY': '181.41.64.0/18',
4270         'HK': '113.252.0.0/14',
4271         'HN': '181.210.0.0/16',
4272         'HR': '93.136.0.0/13',
4273         'HT': '148.102.128.0/17',
4274         'HU': '84.0.0.0/14',
4275         'ID': '39.192.0.0/10',
4276         'IE': '87.32.0.0/12',
4277         'IL': '79.176.0.0/13',
4278         'IM': '5.62.80.0/20',
4279         'IN': '117.192.0.0/10',
4280         'IO': '203.83.48.0/21',
4281         'IQ': '37.236.0.0/14',
4282         'IR': '2.176.0.0/12',
4283         'IS': '82.221.0.0/16',
4284         'IT': '79.0.0.0/10',
4285         'JE': '87.244.64.0/18',
4286         'JM': '72.27.0.0/17',
4287         'JO': '176.29.0.0/16',
4288         'JP': '133.0.0.0/8',
4289         'KE': '105.48.0.0/12',
4290         'KG': '158.181.128.0/17',
4291         'KH': '36.37.128.0/17',
4292         'KI': '103.25.140.0/22',
4293         'KM': '197.255.224.0/20',
4294         'KN': '198.167.192.0/19',
4295         'KP': '175.45.176.0/22',
4296         'KR': '175.192.0.0/10',
4297         'KW': '37.36.0.0/14',
4298         'KY': '64.96.0.0/15',
4299         'KZ': '2.72.0.0/13',
4300         'LA': '115.84.64.0/18',
4301         'LB': '178.135.0.0/16',
4302         'LC': '24.92.144.0/20',
4303         'LI': '82.117.0.0/19',
4304         'LK': '112.134.0.0/15',
4305         'LR': '102.183.0.0/16',
4306         'LS': '129.232.0.0/17',
4307         'LT': '78.56.0.0/13',
4308         'LU': '188.42.0.0/16',
4309         'LV': '46.109.0.0/16',
4310         'LY': '41.252.0.0/14',
4311         'MA': '105.128.0.0/11',
4312         'MC': '88.209.64.0/18',
4313         'MD': '37.246.0.0/16',
4314         'ME': '178.175.0.0/17',
4315         'MF': '74.112.232.0/21',
4316         'MG': '154.126.0.0/17',
4317         'MH': '117.103.88.0/21',
4318         'MK': '77.28.0.0/15',
4319         'ML': '154.118.128.0/18',
4320         'MM': '37.111.0.0/17',
4321         'MN': '49.0.128.0/17',
4322         'MO': '60.246.0.0/16',
4323         'MP': '202.88.64.0/20',
4324         'MQ': '109.203.224.0/19',
4325         'MR': '41.188.64.0/18',
4326         'MS': '208.90.112.0/22',
4327         'MT': '46.11.0.0/16',
4328         'MU': '105.16.0.0/12',
4329         'MV': '27.114.128.0/18',
4330         'MW': '102.70.0.0/15',
4331         'MX': '187.192.0.0/11',
4332         'MY': '175.136.0.0/13',
4333         'MZ': '197.218.0.0/15',
4334         'NA': '41.182.0.0/16',
4335         'NC': '101.101.0.0/18',
4336         'NE': '197.214.0.0/18',
4337         'NF': '203.17.240.0/22',
4338         'NG': '105.112.0.0/12',
4339         'NI': '186.76.0.0/15',
4340         'NL': '145.96.0.0/11',
4341         'NO': '84.208.0.0/13',
4342         'NP': '36.252.0.0/15',
4343         'NR': '203.98.224.0/19',
4344         'NU': '49.156.48.0/22',
4345         'NZ': '49.224.0.0/14',
4346         'OM': '5.36.0.0/15',
4347         'PA': '186.72.0.0/15',
4348         'PE': '186.160.0.0/14',
4349         'PF': '123.50.64.0/18',
4350         'PG': '124.240.192.0/19',
4351         'PH': '49.144.0.0/13',
4352         'PK': '39.32.0.0/11',
4353         'PL': '83.0.0.0/11',
4354         'PM': '70.36.0.0/20',
4355         'PR': '66.50.0.0/16',
4356         'PS': '188.161.0.0/16',
4357         'PT': '85.240.0.0/13',
4358         'PW': '202.124.224.0/20',
4359         'PY': '181.120.0.0/14',
4360         'QA': '37.210.0.0/15',
4361         'RE': '102.35.0.0/16',
4362         'RO': '79.112.0.0/13',
4363         'RS': '93.86.0.0/15',
4364         'RU': '5.136.0.0/13',
4365         'RW': '41.186.0.0/16',
4366         'SA': '188.48.0.0/13',
4367         'SB': '202.1.160.0/19',
4368         'SC': '154.192.0.0/11',
4369         'SD': '102.120.0.0/13',
4370         'SE': '78.64.0.0/12',
4371         'SG': '8.128.0.0/10',
4372         'SI': '188.196.0.0/14',
4373         'SK': '78.98.0.0/15',
4374         'SL': '102.143.0.0/17',
4375         'SM': '89.186.32.0/19',
4376         'SN': '41.82.0.0/15',
4377         'SO': '154.115.192.0/18',
4378         'SR': '186.179.128.0/17',
4379         'SS': '105.235.208.0/21',
4380         'ST': '197.159.160.0/19',
4381         'SV': '168.243.0.0/16',
4382         'SX': '190.102.0.0/20',
4383         'SY': '5.0.0.0/16',
4384         'SZ': '41.84.224.0/19',
4385         'TC': '65.255.48.0/20',
4386         'TD': '154.68.128.0/19',
4387         'TG': '196.168.0.0/14',
4388         'TH': '171.96.0.0/13',
4389         'TJ': '85.9.128.0/18',
4390         'TK': '27.96.24.0/21',
4391         'TL': '180.189.160.0/20',
4392         'TM': '95.85.96.0/19',
4393         'TN': '197.0.0.0/11',
4394         'TO': '175.176.144.0/21',
4395         'TR': '78.160.0.0/11',
4396         'TT': '186.44.0.0/15',
4397         'TV': '202.2.96.0/19',
4398         'TW': '120.96.0.0/11',
4399         'TZ': '156.156.0.0/14',
4400         'UA': '37.52.0.0/14',
4401         'UG': '102.80.0.0/13',
4402         'US': '6.0.0.0/8',
4403         'UY': '167.56.0.0/13',
4404         'UZ': '84.54.64.0/18',
4405         'VA': '212.77.0.0/19',
4406         'VC': '207.191.240.0/21',
4407         'VE': '186.88.0.0/13',
4408         'VG': '66.81.192.0/20',
4409         'VI': '146.226.0.0/16',
4410         'VN': '14.160.0.0/11',
4411         'VU': '202.80.32.0/20',
4412         'WF': '117.20.32.0/21',
4413         'WS': '202.4.32.0/19',
4414         'YE': '134.35.0.0/16',
4415         'YT': '41.242.116.0/22',
4416         'ZA': '41.0.0.0/11',
4417         'ZM': '102.144.0.0/13',
4418         'ZW': '102.177.192.0/18',
4419     }
4420
4421     @classmethod
4422     def random_ipv4(cls, code_or_block):
4423         if len(code_or_block) == 2:
4424             block = cls._country_ip_map.get(code_or_block.upper())
4425             if not block:
4426                 return None
4427         else:
4428             block = code_or_block
4429         addr, preflen = block.split('/')
4430         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4431         addr_max = addr_min | (0xffffffff >> int(preflen))
4432         return compat_str(socket.inet_ntoa(
4433             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4434
4435
4436 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4437     def __init__(self, proxies=None):
4438         # Set default handlers
4439         for type in ('http', 'https'):
4440             setattr(self, '%s_open' % type,
4441                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4442                         meth(r, proxy, type))
4443         compat_urllib_request.ProxyHandler.__init__(self, proxies)
4444
4445     def proxy_open(self, req, proxy, type):
4446         req_proxy = req.headers.get('Ytdl-request-proxy')
4447         if req_proxy is not None:
4448             proxy = req_proxy
4449             del req.headers['Ytdl-request-proxy']
4450
4451         if proxy == '__noproxy__':
4452             return None  # No Proxy
4453         if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4454             req.add_header('Ytdl-socks-proxy', proxy)
4455             # yt-dlp's http/https handlers do wrapping the socket with socks
4456             return None
4457         return compat_urllib_request.ProxyHandler.proxy_open(
4458             self, req, proxy, type)
4459
4460
4461 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4462 # released into Public Domain
4463 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4464
4465 def long_to_bytes(n, blocksize=0):
4466     """long_to_bytes(n:long, blocksize:int) : string
4467     Convert a long integer to a byte string.
4468
4469     If optional blocksize is given and greater than zero, pad the front of the
4470     byte string with binary zeros so that the length is a multiple of
4471     blocksize.
4472     """
4473     # after much testing, this algorithm was deemed to be the fastest
4474     s = b''
4475     n = int(n)
4476     while n > 0:
4477         s = compat_struct_pack('>I', n & 0xffffffff) + s
4478         n = n >> 32
4479     # strip off leading zeros
4480     for i in range(len(s)):
4481         if s[i] != b'\000'[0]:
4482             break
4483     else:
4484         # only happens when n == 0
4485         s = b'\000'
4486         i = 0
4487     s = s[i:]
4488     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4489     # de-padding being done above, but sigh...
4490     if blocksize > 0 and len(s) % blocksize:
4491         s = (blocksize - len(s) % blocksize) * b'\000' + s
4492     return s
4493
4494
4495 def bytes_to_long(s):
4496     """bytes_to_long(string) : long
4497     Convert a byte string to a long integer.
4498
4499     This is (essentially) the inverse of long_to_bytes().
4500     """
4501     acc = 0
4502     length = len(s)
4503     if length % 4:
4504         extra = (4 - length % 4)
4505         s = b'\000' * extra + s
4506         length = length + extra
4507     for i in range(0, length, 4):
4508         acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4509     return acc
4510
4511
4512 def ohdave_rsa_encrypt(data, exponent, modulus):
4513     '''
4514     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4515
4516     Input:
4517         data: data to encrypt, bytes-like object
4518         exponent, modulus: parameter e and N of RSA algorithm, both integer
4519     Output: hex string of encrypted data
4520
4521     Limitation: supports one block encryption only
4522     '''
4523
4524     payload = int(binascii.hexlify(data[::-1]), 16)
4525     encrypted = pow(payload, exponent, modulus)
4526     return '%x' % encrypted
4527
4528
4529 def pkcs1pad(data, length):
4530     """
4531     Padding input data with PKCS#1 scheme
4532
4533     @param {int[]} data        input data
4534     @param {int}   length      target length
4535     @returns {int[]}           padded data
4536     """
4537     if len(data) > length - 11:
4538         raise ValueError('Input data too long for PKCS#1 padding')
4539
4540     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4541     return [0, 2] + pseudo_random + [0] + data
4542
4543
4544 def encode_base_n(num, n, table=None):
4545     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4546     if not table:
4547         table = FULL_TABLE[:n]
4548
4549     if n > len(table):
4550         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4551
4552     if num == 0:
4553         return table[0]
4554
4555     ret = ''
4556     while num:
4557         ret = table[num % n] + ret
4558         num = num // n
4559     return ret
4560
4561
4562 def decode_packed_codes(code):
4563     mobj = re.search(PACKED_CODES_RE, code)
4564     obfuscated_code, base, count, symbols = mobj.groups()
4565     base = int(base)
4566     count = int(count)
4567     symbols = symbols.split('|')
4568     symbol_table = {}
4569
4570     while count:
4571         count -= 1
4572         base_n_count = encode_base_n(count, base)
4573         symbol_table[base_n_count] = symbols[count] or base_n_count
4574
4575     return re.sub(
4576         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4577         obfuscated_code)
4578
4579
4580 def caesar(s, alphabet, shift):
4581     if shift == 0:
4582         return s
4583     l = len(alphabet)
4584     return ''.join(
4585         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4586         for c in s)
4587
4588
4589 def rot47(s):
4590     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4591
4592
4593 def parse_m3u8_attributes(attrib):
4594     info = {}
4595     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4596         if val.startswith('"'):
4597             val = val[1:-1]
4598         info[key] = val
4599     return info
4600
4601
4602 def urshift(val, n):
4603     return val >> n if val >= 0 else (val + 0x100000000) >> n
4604
4605
4606 # Based on png2str() written by @gdkchan and improved by @yokrysty
4607 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4608 def decode_png(png_data):
4609     # Reference: https://www.w3.org/TR/PNG/
4610     header = png_data[8:]
4611
4612     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4613         raise OSError('Not a valid PNG file.')
4614
4615     int_map = {1: '>B', 2: '>H', 4: '>I'}
4616     unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4617
4618     chunks = []
4619
4620     while header:
4621         length = unpack_integer(header[:4])
4622         header = header[4:]
4623
4624         chunk_type = header[:4]
4625         header = header[4:]
4626
4627         chunk_data = header[:length]
4628         header = header[length:]
4629
4630         header = header[4:]  # Skip CRC
4631
4632         chunks.append({
4633             'type': chunk_type,
4634             'length': length,
4635             'data': chunk_data
4636         })
4637
4638     ihdr = chunks[0]['data']
4639
4640     width = unpack_integer(ihdr[:4])
4641     height = unpack_integer(ihdr[4:8])
4642
4643     idat = b''
4644
4645     for chunk in chunks:
4646         if chunk['type'] == b'IDAT':
4647             idat += chunk['data']
4648
4649     if not idat:
4650         raise OSError('Unable to read PNG data.')
4651
4652     decompressed_data = bytearray(zlib.decompress(idat))
4653
4654     stride = width * 3
4655     pixels = []
4656
4657     def _get_pixel(idx):
4658         x = idx % stride
4659         y = idx // stride
4660         return pixels[y][x]
4661
4662     for y in range(height):
4663         basePos = y * (1 + stride)
4664         filter_type = decompressed_data[basePos]
4665
4666         current_row = []
4667
4668         pixels.append(current_row)
4669
4670         for x in range(stride):
4671             color = decompressed_data[1 + basePos + x]
4672             basex = y * stride + x
4673             left = 0
4674             up = 0
4675
4676             if x > 2:
4677                 left = _get_pixel(basex - 3)
4678             if y > 0:
4679                 up = _get_pixel(basex - stride)
4680
4681             if filter_type == 1:  # Sub
4682                 color = (color + left) & 0xff
4683             elif filter_type == 2:  # Up
4684                 color = (color + up) & 0xff
4685             elif filter_type == 3:  # Average
4686                 color = (color + ((left + up) >> 1)) & 0xff
4687             elif filter_type == 4:  # Paeth
4688                 a = left
4689                 b = up
4690                 c = 0
4691
4692                 if x > 2 and y > 0:
4693                     c = _get_pixel(basex - stride - 3)
4694
4695                 p = a + b - c
4696
4697                 pa = abs(p - a)
4698                 pb = abs(p - b)
4699                 pc = abs(p - c)
4700
4701                 if pa <= pb and pa <= pc:
4702                     color = (color + a) & 0xff
4703                 elif pb <= pc:
4704                     color = (color + b) & 0xff
4705                 else:
4706                     color = (color + c) & 0xff
4707
4708             current_row.append(color)
4709
4710     return width, height, pixels
4711
4712
4713 def write_xattr(path, key, value):
4714     # Windows: Write xattrs to NTFS Alternate Data Streams:
4715     # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4716     if compat_os_name == 'nt':
4717         assert ':' not in key
4718         assert os.path.exists(path)
4719
4720         try:
4721             with open(f'{path}:{key}', 'wb') as f:
4722                 f.write(value)
4723         except OSError as e:
4724             raise XAttrMetadataError(e.errno, e.strerror)
4725         return
4726
4727     # UNIX Method 1. Use xattrs/pyxattrs modules
4728     from .dependencies import xattr
4729
4730     setxattr = None
4731     if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4732         # Unicode arguments are not supported in pyxattr until version 0.5.0
4733         # See https://github.com/ytdl-org/youtube-dl/issues/5498
4734         if version_tuple(xattr.__version__) >= (0, 5, 0):
4735             setxattr = xattr.set
4736     elif xattr:
4737         setxattr = xattr.setxattr
4738
4739     if setxattr:
4740         try:
4741             setxattr(path, key, value)
4742         except OSError as e:
4743             raise XAttrMetadataError(e.errno, e.strerror)
4744         return
4745
4746     # UNIX Method 2. Use setfattr/xattr executables
4747     exe = ('setfattr' if check_executable('setfattr', ['--version'])
4748            else 'xattr' if check_executable('xattr', ['-h']) else None)
4749     if not exe:
4750         raise XAttrUnavailableError(
4751             'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4752             + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4753
4754     value = value.decode()
4755     try:
4756         p = Popen(
4757             [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4758             stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4759     except OSError as e:
4760         raise XAttrMetadataError(e.errno, e.strerror)
4761     stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
4762     if p.returncode:
4763         raise XAttrMetadataError(p.returncode, stderr)
4764
4765
4766 def random_birthday(year_field, month_field, day_field):
4767     start_date = datetime.date(1950, 1, 1)
4768     end_date = datetime.date(1995, 12, 31)
4769     offset = random.randint(0, (end_date - start_date).days)
4770     random_date = start_date + datetime.timedelta(offset)
4771     return {
4772         year_field: str(random_date.year),
4773         month_field: str(random_date.month),
4774         day_field: str(random_date.day),
4775     }
4776
4777
4778 # Templates for internet shortcut files, which are plain text files.
4779 DOT_URL_LINK_TEMPLATE = '''\
4780 [InternetShortcut]
4781 URL=%(url)s
4782 '''
4783
4784 DOT_WEBLOC_LINK_TEMPLATE = '''\
4785 <?xml version="1.0" encoding="UTF-8"?>
4786 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4787 <plist version="1.0">
4788 <dict>
4789 \t<key>URL</key>
4790 \t<string>%(url)s</string>
4791 </dict>
4792 </plist>
4793 '''
4794
4795 DOT_DESKTOP_LINK_TEMPLATE = '''\
4796 [Desktop Entry]
4797 Encoding=UTF-8
4798 Name=%(filename)s
4799 Type=Link
4800 URL=%(url)s
4801 Icon=text-html
4802 '''
4803
4804 LINK_TEMPLATES = {
4805     'url': DOT_URL_LINK_TEMPLATE,
4806     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4807     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4808 }
4809
4810
4811 def iri_to_uri(iri):
4812     """
4813     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4814
4815     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4816     """
4817
4818     iri_parts = compat_urllib_parse_urlparse(iri)
4819
4820     if '[' in iri_parts.netloc:
4821         raise ValueError('IPv6 URIs are not, yet, supported.')
4822         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4823
4824     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4825
4826     net_location = ''
4827     if iri_parts.username:
4828         net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4829         if iri_parts.password is not None:
4830             net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4831         net_location += '@'
4832
4833     net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
4834     # The 'idna' encoding produces ASCII text.
4835     if iri_parts.port is not None and iri_parts.port != 80:
4836         net_location += ':' + str(iri_parts.port)
4837
4838     return urllib.parse.urlunparse(
4839         (iri_parts.scheme,
4840             net_location,
4841
4842             urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4843
4844             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4845             urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4846
4847             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4848             urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4849
4850             urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4851
4852     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4853
4854
4855 def to_high_limit_path(path):
4856     if sys.platform in ['win32', 'cygwin']:
4857         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4858         return '\\\\?\\' + os.path.abspath(path)
4859
4860     return path
4861
4862
4863 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4864     val = traverse_obj(obj, *variadic(field))
4865     if val in ignore:
4866         return default
4867     return template % (func(val) if func else val)
4868
4869
4870 def clean_podcast_url(url):
4871     return re.sub(r'''(?x)
4872         (?:
4873             (?:
4874                 chtbl\.com/track|
4875                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4876                 play\.podtrac\.com
4877             )/[^/]+|
4878             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4879             flex\.acast\.com|
4880             pd(?:
4881                 cn\.co| # https://podcorn.com/analytics-prefix/
4882                 st\.fm # https://podsights.com/docs/
4883             )/e
4884         )/''', '', url)
4885
4886
4887 _HEX_TABLE = '0123456789abcdef'
4888
4889
4890 def random_uuidv4():
4891     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4892
4893
4894 def make_dir(path, to_screen=None):
4895     try:
4896         dn = os.path.dirname(path)
4897         if dn and not os.path.exists(dn):
4898             os.makedirs(dn)
4899         return True
4900     except OSError as err:
4901         if callable(to_screen) is not None:
4902             to_screen('unable to create directory ' + error_to_compat_str(err))
4903         return False
4904
4905
4906 def get_executable_path():
4907     from .update import _get_variant_and_executable_path
4908
4909     return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
4910
4911
4912 def load_plugins(name, suffix, namespace):
4913     classes = {}
4914     with contextlib.suppress(FileNotFoundError):
4915         plugins_spec = importlib.util.spec_from_file_location(
4916             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4917         plugins = importlib.util.module_from_spec(plugins_spec)
4918         sys.modules[plugins_spec.name] = plugins
4919         plugins_spec.loader.exec_module(plugins)
4920         for name in dir(plugins):
4921             if name in namespace:
4922                 continue
4923             if not name.endswith(suffix):
4924                 continue
4925             klass = getattr(plugins, name)
4926             classes[name] = namespace[name] = klass
4927     return classes
4928
4929
4930 def traverse_obj(
4931         obj, *path_list, default=None, expected_type=None, get_all=True,
4932         casesense=True, is_user_input=False, traverse_string=False):
4933     ''' Traverse nested list/dict/tuple
4934     @param path_list        A list of paths which are checked one by one.
4935                             Each path is a list of keys where each key is a:
4936                               - None:     Do nothing
4937                               - string:   A dictionary key
4938                               - int:      An index into a list
4939                               - tuple:    A list of keys all of which will be traversed
4940                               - Ellipsis: Fetch all values in the object
4941                               - Function: Takes the key and value as arguments
4942                                           and returns whether the key matches or not
4943     @param default          Default value to return
4944     @param expected_type    Only accept final value of this type (Can also be any callable)
4945     @param get_all          Return all the values obtained from a path or only the first one
4946     @param casesense        Whether to consider dictionary keys as case sensitive
4947     @param is_user_input    Whether the keys are generated from user input. If True,
4948                             strings are converted to int/slice if necessary
4949     @param traverse_string  Whether to traverse inside strings. If True, any
4950                             non-compatible object will also be converted into a string
4951     # TODO: Write tests
4952     '''
4953     if not casesense:
4954         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4955         path_list = (map(_lower, variadic(path)) for path in path_list)
4956
4957     def _traverse_obj(obj, path, _current_depth=0):
4958         nonlocal depth
4959         path = tuple(variadic(path))
4960         for i, key in enumerate(path):
4961             if None in (key, obj):
4962                 return obj
4963             if isinstance(key, (list, tuple)):
4964                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4965                 key = ...
4966             if key is ...:
4967                 obj = (obj.values() if isinstance(obj, dict)
4968                        else obj if isinstance(obj, (list, tuple, LazyList))
4969                        else str(obj) if traverse_string else [])
4970                 _current_depth += 1
4971                 depth = max(depth, _current_depth)
4972                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4973             elif callable(key):
4974                 if isinstance(obj, (list, tuple, LazyList)):
4975                     obj = enumerate(obj)
4976                 elif isinstance(obj, dict):
4977                     obj = obj.items()
4978                 else:
4979                     if not traverse_string:
4980                         return None
4981                     obj = str(obj)
4982                 _current_depth += 1
4983                 depth = max(depth, _current_depth)
4984                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
4985             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4986                 obj = (obj.get(key) if casesense or (key in obj)
4987                        else next((v for k, v in obj.items() if _lower(k) == key), None))
4988             else:
4989                 if is_user_input:
4990                     key = (int_or_none(key) if ':' not in key
4991                            else slice(*map(int_or_none, key.split(':'))))
4992                     if key == slice(None):
4993                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
4994                 if not isinstance(key, (int, slice)):
4995                     return None
4996                 if not isinstance(obj, (list, tuple, LazyList)):
4997                     if not traverse_string:
4998                         return None
4999                     obj = str(obj)
5000                 try:
5001                     obj = obj[key]
5002                 except IndexError:
5003                     return None
5004         return obj
5005
5006     if isinstance(expected_type, type):
5007         type_test = lambda val: val if isinstance(val, expected_type) else None
5008     elif expected_type is not None:
5009         type_test = expected_type
5010     else:
5011         type_test = lambda val: val
5012
5013     for path in path_list:
5014         depth = 0
5015         val = _traverse_obj(obj, path)
5016         if val is not None:
5017             if depth:
5018                 for _ in range(depth - 1):
5019                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5020                 val = [v for v in map(type_test, val) if v is not None]
5021                 if val:
5022                     return val if get_all else val[0]
5023             else:
5024                 val = type_test(val)
5025                 if val is not None:
5026                     return val
5027     return default
5028
5029
5030 def traverse_dict(dictn, keys, casesense=True):
5031     write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5032                  'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5033     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5034
5035
5036 def get_first(obj, keys, **kwargs):
5037     return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5038
5039
5040 def variadic(x, allowed_types=(str, bytes, dict)):
5041     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5042
5043
5044 def decode_base(value, digits):
5045     # This will convert given base-x string to scalar (long or int)
5046     table = {char: index for index, char in enumerate(digits)}
5047     result = 0
5048     base = len(digits)
5049     for chr in value:
5050         result *= base
5051         result += table[chr]
5052     return result
5053
5054
5055 def time_seconds(**kwargs):
5056     t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5057     return t.timestamp()
5058
5059
5060 # create a JSON Web Signature (jws) with HS256 algorithm
5061 # the resulting format is in JWS Compact Serialization
5062 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5063 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5064 def jwt_encode_hs256(payload_data, key, headers={}):
5065     header_data = {
5066         'alg': 'HS256',
5067         'typ': 'JWT',
5068     }
5069     if headers:
5070         header_data.update(headers)
5071     header_b64 = base64.b64encode(json.dumps(header_data).encode())
5072     payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5073     h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5074     signature_b64 = base64.b64encode(h.digest())
5075     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5076     return token
5077
5078
5079 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5080 def jwt_decode_hs256(jwt):
5081     header_b64, payload_b64, signature_b64 = jwt.split('.')
5082     payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5083     return payload_data
5084
5085
5086 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5087
5088
5089 @functools.cache
5090 def supports_terminal_sequences(stream):
5091     if compat_os_name == 'nt':
5092         if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5093             return False
5094     elif not os.getenv('TERM'):
5095         return False
5096     try:
5097         return stream.isatty()
5098     except BaseException:
5099         return False
5100
5101
5102 def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
5103     if compat_os_name != 'nt':
5104         return
5105     global WINDOWS_VT_MODE
5106     startupinfo = subprocess.STARTUPINFO()
5107     startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
5108     try:
5109         subprocess.Popen('', shell=True, startupinfo=startupinfo).wait()
5110     except Exception:
5111         return
5112
5113     WINDOWS_VT_MODE = True
5114     supports_terminal_sequences.cache_clear()
5115
5116
5117 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5118
5119
5120 def remove_terminal_sequences(string):
5121     return _terminal_sequences_re.sub('', string)
5122
5123
5124 def number_of_digits(number):
5125     return len('%d' % number)
5126
5127
5128 def join_nonempty(*values, delim='-', from_dict=None):
5129     if from_dict is not None:
5130         values = map(from_dict.get, values)
5131     return delim.join(map(str, filter(None, values)))
5132
5133
5134 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5135     """
5136     Find the largest format dimensions in terms of video width and, for each thumbnail:
5137     * Modify the URL: Match the width with the provided regex and replace with the former width
5138     * Update dimensions
5139
5140     This function is useful with video services that scale the provided thumbnails on demand
5141     """
5142     _keys = ('width', 'height')
5143     max_dimensions = max(
5144         (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5145         default=(0, 0))
5146     if not max_dimensions[0]:
5147         return thumbnails
5148     return [
5149         merge_dicts(
5150             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5151             dict(zip(_keys, max_dimensions)), thumbnail)
5152         for thumbnail in thumbnails
5153     ]
5154
5155
5156 def parse_http_range(range):
5157     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5158     if not range:
5159         return None, None, None
5160     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5161     if not crg:
5162         return None, None, None
5163     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5164
5165
5166 def read_stdin(what):
5167     eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5168     write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5169     return sys.stdin
5170
5171
5172 class Config:
5173     own_args = None
5174     parsed_args = None
5175     filename = None
5176     __initialized = False
5177
5178     def __init__(self, parser, label=None):
5179         self.parser, self.label = parser, label
5180         self._loaded_paths, self.configs = set(), []
5181
5182     def init(self, args=None, filename=None):
5183         assert not self.__initialized
5184         directory = ''
5185         if filename:
5186             location = os.path.realpath(filename)
5187             directory = os.path.dirname(location)
5188             if location in self._loaded_paths:
5189                 return False
5190             self._loaded_paths.add(location)
5191
5192         self.own_args, self.__initialized = args, True
5193         opts, _ = self.parser.parse_known_args(args)
5194         self.parsed_args, self.filename = args, filename
5195
5196         for location in opts.config_locations or []:
5197             if location == '-':
5198                 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5199                 continue
5200             location = os.path.join(directory, expand_path(location))
5201             if os.path.isdir(location):
5202                 location = os.path.join(location, 'yt-dlp.conf')
5203             if not os.path.exists(location):
5204                 self.parser.error(f'config location {location} does not exist')
5205             self.append_config(self.read_file(location), location)
5206         return True
5207
5208     def __str__(self):
5209         label = join_nonempty(
5210             self.label, 'config', f'"{self.filename}"' if self.filename else '',
5211             delim=' ')
5212         return join_nonempty(
5213             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5214             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5215             delim='\n')
5216
5217     @staticmethod
5218     def read_file(filename, default=[]):
5219         try:
5220             optionf = open(filename)
5221         except OSError:
5222             return default  # silently skip if file is not present
5223         try:
5224             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5225             contents = optionf.read()
5226             res = shlex.split(contents, comments=True)
5227         finally:
5228             optionf.close()
5229         return res
5230
5231     @staticmethod
5232     def hide_login_info(opts):
5233         PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5234         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5235
5236         def _scrub_eq(o):
5237             m = eqre.match(o)
5238             if m:
5239                 return m.group('key') + '=PRIVATE'
5240             else:
5241                 return o
5242
5243         opts = list(map(_scrub_eq, opts))
5244         for idx, opt in enumerate(opts):
5245             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5246                 opts[idx + 1] = 'PRIVATE'
5247         return opts
5248
5249     def append_config(self, *args, label=None):
5250         config = type(self)(self.parser, label)
5251         config._loaded_paths = self._loaded_paths
5252         if config.init(*args):
5253             self.configs.append(config)
5254
5255     @property
5256     def all_args(self):
5257         for config in reversed(self.configs):
5258             yield from config.all_args
5259         yield from self.parsed_args or []
5260
5261     def parse_known_args(self, **kwargs):
5262         return self.parser.parse_known_args(self.all_args, **kwargs)
5263
5264     def parse_args(self):
5265         return self.parser.parse_args(self.all_args)
5266
5267
5268 class WebSocketsWrapper():
5269     """Wraps websockets module to use in non-async scopes"""
5270     pool = None
5271
5272     def __init__(self, url, headers=None, connect=True):
5273         self.loop = asyncio.new_event_loop()
5274         # XXX: "loop" is deprecated
5275         self.conn = websockets.connect(
5276             url, extra_headers=headers, ping_interval=None,
5277             close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5278         if connect:
5279             self.__enter__()
5280         atexit.register(self.__exit__, None, None, None)
5281
5282     def __enter__(self):
5283         if not self.pool:
5284             self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5285         return self
5286
5287     def send(self, *args):
5288         self.run_with_loop(self.pool.send(*args), self.loop)
5289
5290     def recv(self, *args):
5291         return self.run_with_loop(self.pool.recv(*args), self.loop)
5292
5293     def __exit__(self, type, value, traceback):
5294         try:
5295             return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5296         finally:
5297             self.loop.close()
5298             self._cancel_all_tasks(self.loop)
5299
5300     # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5301     # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5302     @staticmethod
5303     def run_with_loop(main, loop):
5304         if not asyncio.iscoroutine(main):
5305             raise ValueError(f'a coroutine was expected, got {main!r}')
5306
5307         try:
5308             return loop.run_until_complete(main)
5309         finally:
5310             loop.run_until_complete(loop.shutdown_asyncgens())
5311             if hasattr(loop, 'shutdown_default_executor'):
5312                 loop.run_until_complete(loop.shutdown_default_executor())
5313
5314     @staticmethod
5315     def _cancel_all_tasks(loop):
5316         to_cancel = asyncio.all_tasks(loop)
5317
5318         if not to_cancel:
5319             return
5320
5321         for task in to_cancel:
5322             task.cancel()
5323
5324         # XXX: "loop" is removed in python 3.10+
5325         loop.run_until_complete(
5326             asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5327
5328         for task in to_cancel:
5329             if task.cancelled():
5330                 continue
5331             if task.exception() is not None:
5332                 loop.call_exception_handler({
5333                     'message': 'unhandled exception during asyncio.run() shutdown',
5334                     'exception': task.exception(),
5335                     'task': task,
5336                 })
5337
5338
5339 def merge_headers(*dicts):
5340     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5341     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5342
5343
5344 class classproperty:
5345     """classmethod(property(func)) that works in py < 3.9"""
5346
5347     def __init__(self, func):
5348         functools.update_wrapper(self, func)
5349         self.func = func
5350
5351     def __get__(self, _, cls):
5352         return self.func(cls)
5353
5354
5355 class Namespace:
5356     """Immutable namespace"""
5357
5358     def __init__(self, **kwargs):
5359         self._dict = kwargs
5360
5361     def __getattr__(self, attr):
5362         return self._dict[attr]
5363
5364     def __contains__(self, item):
5365         return item in self._dict.values()
5366
5367     def __iter__(self):
5368         return iter(self._dict.items())
5369
5370     def __repr__(self):
5371         return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
5372
5373
5374 # Deprecated
5375 has_certifi = bool(certifi)
5376 has_websockets = bool(websockets)