yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import importlib.util
  18 import io
  19 import itertools
  20 import json
  21 import locale
  22 import math
  23 import mimetypes
  24 import operator
  25 import os
  26 import platform
  27 import random
  28 import re
  29 import shlex
  30 import socket
  31 import ssl
  32 import subprocess
  33 import sys
  34 import tempfile
  35 import time
  36 import traceback
  37 import urllib.parse
  38 import xml.etree.ElementTree
  39 import zlib
  40
  41 from .compat import asyncio, functools  # isort: split
  42 from .compat import (
  43     compat_chr,
  44     compat_cookiejar,
  45     compat_etree_fromstring,
  46     compat_expanduser,
  47     compat_html_entities,
  48     compat_html_entities_html5,
  49     compat_HTMLParseError,
  50     compat_HTMLParser,
  51     compat_http_client,
  52     compat_HTTPError,
  53     compat_os_name,
  54     compat_parse_qs,
  55     compat_shlex_quote,
  56     compat_str,
  57     compat_struct_pack,
  58     compat_struct_unpack,
  59     compat_urllib_error,
  60     compat_urllib_parse_unquote_plus,
  61     compat_urllib_parse_urlencode,
  62     compat_urllib_parse_urlparse,
  63     compat_urllib_request,
  64     compat_urlparse,
  65 )
  66 from .dependencies import brotli, certifi, websockets
  67 from .socks import ProxyType, sockssocket
  68
  69
  70 def register_socks_protocols():
  71     # "Register" SOCKS protocols
  72     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  73     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  74     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  75         if scheme not in compat_urlparse.uses_netloc:
  76             compat_urlparse.uses_netloc.append(scheme)
  77
  78
  79 # This is not clearly defined otherwise
  80 compiled_regex_type = type(re.compile(''))
  81
  82
  83 def random_user_agent():
  84     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  85     _CHROME_VERSIONS = (
  86         '90.0.4430.212',
  87         '90.0.4430.24',
  88         '90.0.4430.70',
  89         '90.0.4430.72',
  90         '90.0.4430.85',
  91         '90.0.4430.93',
  92         '91.0.4472.101',
  93         '91.0.4472.106',
  94         '91.0.4472.114',
  95         '91.0.4472.124',
  96         '91.0.4472.164',
  97         '91.0.4472.19',
  98         '91.0.4472.77',
  99         '92.0.4515.107',
 100         '92.0.4515.115',
 101         '92.0.4515.131',
 102         '92.0.4515.159',
 103         '92.0.4515.43',
 104         '93.0.4556.0',
 105         '93.0.4577.15',
 106         '93.0.4577.63',
 107         '93.0.4577.82',
 108         '94.0.4606.41',
 109         '94.0.4606.54',
 110         '94.0.4606.61',
 111         '94.0.4606.71',
 112         '94.0.4606.81',
 113         '94.0.4606.85',
 114         '95.0.4638.17',
 115         '95.0.4638.50',
 116         '95.0.4638.54',
 117         '95.0.4638.69',
 118         '95.0.4638.74',
 119         '96.0.4664.18',
 120         '96.0.4664.45',
 121         '96.0.4664.55',
 122         '96.0.4664.93',
 123         '97.0.4692.20',
 124     )
 125     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 126
 127
 128 SUPPORTED_ENCODINGS = [
 129     'gzip', 'deflate'
 130 ]
 131 if brotli:
 132     SUPPORTED_ENCODINGS.append('br')
 133
 134 std_headers = {
 135     'User-Agent': random_user_agent(),
 136     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 137     'Accept-Language': 'en-us,en;q=0.5',
 138     'Sec-Fetch-Mode': 'navigate',
 139 }
 140
 141
 142 USER_AGENTS = {
 143     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 144 }
 145
 146
 147 NO_DEFAULT = object()
 148
 149 ENGLISH_MONTH_NAMES = [
 150     'January', 'February', 'March', 'April', 'May', 'June',
 151     'July', 'August', 'September', 'October', 'November', 'December']
 152
 153 MONTH_NAMES = {
 154     'en': ENGLISH_MONTH_NAMES,
 155     'fr': [
 156         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 157         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 158 }
 159
 160 KNOWN_EXTENSIONS = (
 161     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 162     'flv', 'f4v', 'f4a', 'f4b',
 163     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 164     'mkv', 'mka', 'mk3d',
 165     'avi', 'divx',
 166     'mov',
 167     'asf', 'wmv', 'wma',
 168     '3gp', '3g2',
 169     'mp3',
 170     'flac',
 171     'ape',
 172     'wav',
 173     'f4f', 'f4m', 'm3u8', 'smil')
 174
 175 # needed for sanitizing filenames in restricted mode
 176 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 177                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 178                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 179
 180 DATE_FORMATS = (
 181     '%d %B %Y',
 182     '%d %b %Y',
 183     '%B %d %Y',
 184     '%B %dst %Y',
 185     '%B %dnd %Y',
 186     '%B %drd %Y',
 187     '%B %dth %Y',
 188     '%b %d %Y',
 189     '%b %dst %Y',
 190     '%b %dnd %Y',
 191     '%b %drd %Y',
 192     '%b %dth %Y',
 193     '%b %dst %Y %I:%M',
 194     '%b %dnd %Y %I:%M',
 195     '%b %drd %Y %I:%M',
 196     '%b %dth %Y %I:%M',
 197     '%Y %m %d',
 198     '%Y-%m-%d',
 199     '%Y.%m.%d.',
 200     '%Y/%m/%d',
 201     '%Y/%m/%d %H:%M',
 202     '%Y/%m/%d %H:%M:%S',
 203     '%Y%m%d%H%M',
 204     '%Y%m%d%H%M%S',
 205     '%Y%m%d',
 206     '%Y-%m-%d %H:%M',
 207     '%Y-%m-%d %H:%M:%S',
 208     '%Y-%m-%d %H:%M:%S.%f',
 209     '%Y-%m-%d %H:%M:%S:%f',
 210     '%d.%m.%Y %H:%M',
 211     '%d.%m.%Y %H.%M',
 212     '%Y-%m-%dT%H:%M:%SZ',
 213     '%Y-%m-%dT%H:%M:%S.%fZ',
 214     '%Y-%m-%dT%H:%M:%S.%f0Z',
 215     '%Y-%m-%dT%H:%M:%S',
 216     '%Y-%m-%dT%H:%M:%S.%f',
 217     '%Y-%m-%dT%H:%M',
 218     '%b %d %Y at %H:%M',
 219     '%b %d %Y at %H:%M:%S',
 220     '%B %d %Y at %H:%M',
 221     '%B %d %Y at %H:%M:%S',
 222     '%H:%M %d-%b-%Y',
 223 )
 224
 225 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 226 DATE_FORMATS_DAY_FIRST.extend([
 227     '%d-%m-%Y',
 228     '%d.%m.%Y',
 229     '%d.%m.%y',
 230     '%d/%m/%Y',
 231     '%d/%m/%y',
 232     '%d/%m/%Y %H:%M:%S',
 233 ])
 234
 235 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 236 DATE_FORMATS_MONTH_FIRST.extend([
 237     '%m-%d-%Y',
 238     '%m.%d.%Y',
 239     '%m/%d/%Y',
 240     '%m/%d/%y',
 241     '%m/%d/%Y %H:%M:%S',
 242 ])
 243
 244 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 245 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 246
 247 NUMBER_RE = r'\d+(?:\.\d+)?'
 248
 249
 250 @functools.cache
 251 def preferredencoding():
 252     """Get preferred encoding.
 253
 254     Returns the best encoding scheme for the system, based on
 255     locale.getpreferredencoding() and some further tweaks.
 256     """
 257     try:
 258         pref = locale.getpreferredencoding()
 259         'TEST'.encode(pref)
 260     except Exception:
 261         pref = 'UTF-8'
 262
 263     return pref
 264
 265
 266 def write_json_file(obj, fn):
 267     """ Encode obj as JSON and write it to fn, atomically if possible """
 268
 269     tf = tempfile.NamedTemporaryFile(
 270         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 271         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 272
 273     try:
 274         with tf:
 275             json.dump(obj, tf, ensure_ascii=False)
 276         if sys.platform == 'win32':
 277             # Need to remove existing file on Windows, else os.rename raises
 278             # WindowsError or FileExistsError.
 279             with contextlib.suppress(OSError):
 280                 os.unlink(fn)
 281         with contextlib.suppress(OSError):
 282             mask = os.umask(0)
 283             os.umask(mask)
 284             os.chmod(tf.name, 0o666 & ~mask)
 285         os.rename(tf.name, fn)
 286     except Exception:
 287         with contextlib.suppress(OSError):
 288             os.remove(tf.name)
 289         raise
 290
 291
 292 def find_xpath_attr(node, xpath, key, val=None):
 293     """ Find the xpath xpath[@key=val] """
 294     assert re.match(r'^[a-zA-Z_-]+$', key)
 295     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 296     return node.find(expr)
 297
 298 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 299 # the namespace parameter
 300
 301
 302 def xpath_with_ns(path, ns_map):
 303     components = [c.split(':') for c in path.split('/')]
 304     replaced = []
 305     for c in components:
 306         if len(c) == 1:
 307             replaced.append(c[0])
 308         else:
 309             ns, tag = c
 310             replaced.append('{%s}%s' % (ns_map[ns], tag))
 311     return '/'.join(replaced)
 312
 313
 314 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 315     def _find_xpath(xpath):
 316         return node.find(xpath)
 317
 318     if isinstance(xpath, (str, compat_str)):
 319         n = _find_xpath(xpath)
 320     else:
 321         for xp in xpath:
 322             n = _find_xpath(xp)
 323             if n is not None:
 324                 break
 325
 326     if n is None:
 327         if default is not NO_DEFAULT:
 328             return default
 329         elif fatal:
 330             name = xpath if name is None else name
 331             raise ExtractorError('Could not find XML element %s' % name)
 332         else:
 333             return None
 334     return n
 335
 336
 337 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 338     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 339     if n is None or n == default:
 340         return n
 341     if n.text is None:
 342         if default is not NO_DEFAULT:
 343             return default
 344         elif fatal:
 345             name = xpath if name is None else name
 346             raise ExtractorError('Could not find XML element\'s text %s' % name)
 347         else:
 348             return None
 349     return n.text
 350
 351
 352 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 353     n = find_xpath_attr(node, xpath, key)
 354     if n is None:
 355         if default is not NO_DEFAULT:
 356             return default
 357         elif fatal:
 358             name = f'{xpath}[@{key}]' if name is None else name
 359             raise ExtractorError('Could not find XML attribute %s' % name)
 360         else:
 361             return None
 362     return n.attrib[key]
 363
 364
 365 def get_element_by_id(id, html, **kwargs):
 366     """Return the content of the tag with the specified ID in the passed HTML document"""
 367     return get_element_by_attribute('id', id, html, **kwargs)
 368
 369
 370 def get_element_html_by_id(id, html, **kwargs):
 371     """Return the html of the tag with the specified ID in the passed HTML document"""
 372     return get_element_html_by_attribute('id', id, html, **kwargs)
 373
 374
 375 def get_element_by_class(class_name, html):
 376     """Return the content of the first tag with the specified class in the passed HTML document"""
 377     retval = get_elements_by_class(class_name, html)
 378     return retval[0] if retval else None
 379
 380
 381 def get_element_html_by_class(class_name, html):
 382     """Return the html of the first tag with the specified class in the passed HTML document"""
 383     retval = get_elements_html_by_class(class_name, html)
 384     return retval[0] if retval else None
 385
 386
 387 def get_element_by_attribute(attribute, value, html, **kwargs):
 388     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 389     return retval[0] if retval else None
 390
 391
 392 def get_element_html_by_attribute(attribute, value, html, **kargs):
 393     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 394     return retval[0] if retval else None
 395
 396
 397 def get_elements_by_class(class_name, html, **kargs):
 398     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 399     return get_elements_by_attribute(
 400         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 401         html, escape_value=False)
 402
 403
 404 def get_elements_html_by_class(class_name, html):
 405     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 406     return get_elements_html_by_attribute(
 407         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 408         html, escape_value=False)
 409
 410
 411 def get_elements_by_attribute(*args, **kwargs):
 412     """Return the content of the tag with the specified attribute in the passed HTML document"""
 413     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 414
 415
 416 def get_elements_html_by_attribute(*args, **kwargs):
 417     """Return the html of the tag with the specified attribute in the passed HTML document"""
 418     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 419
 420
 421 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 422     """
 423     Return the text (content) and the html (whole) of the tag with the specified
 424     attribute in the passed HTML document
 425     """
 426
 427     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 428
 429     value = re.escape(value) if escape_value else value
 430
 431     partial_element_re = rf'''(?x)
 432         <(?P<tag>[a-zA-Z0-9:._-]+)
 433          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 434          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 435         '''
 436
 437     for m in re.finditer(partial_element_re, html):
 438         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 439
 440         yield (
 441             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 442             whole
 443         )
 444
 445
 446 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 447     """
 448     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 449     closing tag for the first opening tag it has encountered, and can be used
 450     as a context manager
 451     """
 452
 453     class HTMLBreakOnClosingTagException(Exception):
 454         pass
 455
 456     def __init__(self):
 457         self.tagstack = collections.deque()
 458         compat_HTMLParser.__init__(self)
 459
 460     def __enter__(self):
 461         return self
 462
 463     def __exit__(self, *_):
 464         self.close()
 465
 466     def close(self):
 467         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 468         # so data remains buffered; we no longer have any interest in it, thus
 469         # override this method to discard it
 470         pass
 471
 472     def handle_starttag(self, tag, _):
 473         self.tagstack.append(tag)
 474
 475     def handle_endtag(self, tag):
 476         if not self.tagstack:
 477             raise compat_HTMLParseError('no tags in the stack')
 478         while self.tagstack:
 479             inner_tag = self.tagstack.pop()
 480             if inner_tag == tag:
 481                 break
 482         else:
 483             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 484         if not self.tagstack:
 485             raise self.HTMLBreakOnClosingTagException()
 486
 487
 488 def get_element_text_and_html_by_tag(tag, html):
 489     """
 490     For the first element with the specified tag in the passed HTML document
 491     return its' content (text) and the whole element (html)
 492     """
 493     def find_or_raise(haystack, needle, exc):
 494         try:
 495             return haystack.index(needle)
 496         except ValueError:
 497             raise exc
 498     closing_tag = f'</{tag}>'
 499     whole_start = find_or_raise(
 500         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 501     content_start = find_or_raise(
 502         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 503     content_start += whole_start + 1
 504     with HTMLBreakOnClosingTagParser() as parser:
 505         parser.feed(html[whole_start:content_start])
 506         if not parser.tagstack or parser.tagstack[0] != tag:
 507             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 508         offset = content_start
 509         while offset < len(html):
 510             next_closing_tag_start = find_or_raise(
 511                 html[offset:], closing_tag,
 512                 compat_HTMLParseError(f'closing {tag} tag not found'))
 513             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 514             try:
 515                 parser.feed(html[offset:offset + next_closing_tag_end])
 516                 offset += next_closing_tag_end
 517             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 518                 return html[content_start:offset + next_closing_tag_start], \
 519                     html[whole_start:offset + next_closing_tag_end]
 520         raise compat_HTMLParseError('unexpected end of html')
 521
 522
 523 class HTMLAttributeParser(compat_HTMLParser):
 524     """Trivial HTML parser to gather the attributes for a single element"""
 525
 526     def __init__(self):
 527         self.attrs = {}
 528         compat_HTMLParser.__init__(self)
 529
 530     def handle_starttag(self, tag, attrs):
 531         self.attrs = dict(attrs)
 532
 533
 534 class HTMLListAttrsParser(compat_HTMLParser):
 535     """HTML parser to gather the attributes for the elements of a list"""
 536
 537     def __init__(self):
 538         compat_HTMLParser.__init__(self)
 539         self.items = []
 540         self._level = 0
 541
 542     def handle_starttag(self, tag, attrs):
 543         if tag == 'li' and self._level == 0:
 544             self.items.append(dict(attrs))
 545         self._level += 1
 546
 547     def handle_endtag(self, tag):
 548         self._level -= 1
 549
 550
 551 def extract_attributes(html_element):
 552     """Given a string for an HTML element such as
 553     <el
 554          a="foo" B="bar" c="&98;az" d=boz
 555          empty= noval entity="&amp;"
 556          sq='"' dq="'"
 557     >
 558     Decode and return a dictionary of attributes.
 559     {
 560         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 561         'empty': '', 'noval': None, 'entity': '&',
 562         'sq': '"', 'dq': '\''
 563     }.
 564     """
 565     parser = HTMLAttributeParser()
 566     with contextlib.suppress(compat_HTMLParseError):
 567         parser.feed(html_element)
 568         parser.close()
 569     return parser.attrs
 570
 571
 572 def parse_list(webpage):
 573     """Given a string for an series of HTML <li> elements,
 574     return a dictionary of their attributes"""
 575     parser = HTMLListAttrsParser()
 576     parser.feed(webpage)
 577     parser.close()
 578     return parser.items
 579
 580
 581 def clean_html(html):
 582     """Clean an HTML snippet into a readable string"""
 583
 584     if html is None:  # Convenience for sanitizing descriptions etc.
 585         return html
 586
 587     html = re.sub(r'\s+', ' ', html)
 588     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 589     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 590     # Strip html tags
 591     html = re.sub('<.*?>', '', html)
 592     # Replace html entities
 593     html = unescapeHTML(html)
 594     return html.strip()
 595
 596
 597 class LenientJSONDecoder(json.JSONDecoder):
 598     def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
 599         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 600         super().__init__(*args, **kwargs)
 601
 602     def decode(self, s):
 603         if self.transform_source:
 604             s = self.transform_source(s)
 605         if self.ignore_extra:
 606             return self.raw_decode(s.lstrip())[0]
 607         return super().decode(s)
 608
 609
 610 def sanitize_open(filename, open_mode):
 611     """Try to open the given filename, and slightly tweak it if this fails.
 612
 613     Attempts to open the given filename. If this fails, it tries to change
 614     the filename slightly, step by step, until it's either able to open it
 615     or it fails and raises a final exception, like the standard open()
 616     function.
 617
 618     It returns the tuple (stream, definitive_file_name).
 619     """
 620     if filename == '-':
 621         if sys.platform == 'win32':
 622             import msvcrt
 623             msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 624         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 625
 626     for attempt in range(2):
 627         try:
 628             try:
 629                 if sys.platform == 'win32':
 630                     # FIXME: An exclusive lock also locks the file from being read.
 631                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 632                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 633                     raise LockingUnsupportedError()
 634                 stream = locked_file(filename, open_mode, block=False).__enter__()
 635             except OSError:
 636                 stream = open(filename, open_mode)
 637             return stream, filename
 638         except OSError as err:
 639             if attempt or err.errno in (errno.EACCES,):
 640                 raise
 641             old_filename, filename = filename, sanitize_path(filename)
 642             if old_filename == filename:
 643                 raise
 644
 645
 646 def timeconvert(timestr):
 647     """Convert RFC 2822 defined time string into system timestamp"""
 648     timestamp = None
 649     timetuple = email.utils.parsedate_tz(timestr)
 650     if timetuple is not None:
 651         timestamp = email.utils.mktime_tz(timetuple)
 652     return timestamp
 653
 654
 655 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 656     """Sanitizes a string so it could be used as part of a filename.
 657     @param restricted   Use a stricter subset of allowed characters
 658     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 659                         If unset, yt-dlp's new sanitization rules are in effect
 660     """
 661     if s == '':
 662         return ''
 663
 664     def replace_insane(char):
 665         if restricted and char in ACCENT_CHARS:
 666             return ACCENT_CHARS[char]
 667         elif not restricted and char == '\n':
 668             return '\0 '
 669         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 670             return ''
 671         elif char == '"':
 672             return '' if restricted else '\''
 673         elif char == ':':
 674             return '\0_\0-' if restricted else '\0 \0-'
 675         elif char in '\\/|*<>':
 676             return '\0_'
 677         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 678             return '\0_'
 679         return char
 680
 681     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 682     result = ''.join(map(replace_insane, s))
 683     if is_id is NO_DEFAULT:
 684         result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
 685         STRIP_RE = '(?:\0.|[ _-])*'
 686         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 687     result = result.replace('\0', '') or '_'
 688
 689     if not is_id:
 690         while '__' in result:
 691             result = result.replace('__', '_')
 692         result = result.strip('_')
 693         # Common case of "Foreign band name - English song title"
 694         if restricted and result.startswith('-_'):
 695             result = result[2:]
 696         if result.startswith('-'):
 697             result = '_' + result[len('-'):]
 698         result = result.lstrip('.')
 699         if not result:
 700             result = '_'
 701     return result
 702
 703
 704 def sanitize_path(s, force=False):
 705     """Sanitizes and normalizes path on Windows"""
 706     if sys.platform == 'win32':
 707         force = False
 708         drive_or_unc, _ = os.path.splitdrive(s)
 709     elif force:
 710         drive_or_unc = ''
 711     else:
 712         return s
 713
 714     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 715     if drive_or_unc:
 716         norm_path.pop(0)
 717     sanitized_path = [
 718         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 719         for path_part in norm_path]
 720     if drive_or_unc:
 721         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 722     elif force and s and s[0] == os.path.sep:
 723         sanitized_path.insert(0, os.path.sep)
 724     return os.path.join(*sanitized_path)
 725
 726
 727 def sanitize_url(url):
 728     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 729     # the number of unwanted failures due to missing protocol
 730     if url is None:
 731         return
 732     elif url.startswith('//'):
 733         return 'http:%s' % url
 734     # Fix some common typos seen so far
 735     COMMON_TYPOS = (
 736         # https://github.com/ytdl-org/youtube-dl/issues/15649
 737         (r'^httpss://', r'https://'),
 738         # https://bx1.be/lives/direct-tv/
 739         (r'^rmtp([es]?)://', r'rtmp\1://'),
 740     )
 741     for mistake, fixup in COMMON_TYPOS:
 742         if re.match(mistake, url):
 743             return re.sub(mistake, fixup, url)
 744     return url
 745
 746
 747 def extract_basic_auth(url):
 748     parts = compat_urlparse.urlsplit(url)
 749     if parts.username is None:
 750         return url, None
 751     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 752         parts.hostname if parts.port is None
 753         else '%s:%d' % (parts.hostname, parts.port))))
 754     auth_payload = base64.b64encode(
 755         ('%s:%s' % (parts.username, parts.password or '')).encode())
 756     return url, f'Basic {auth_payload.decode()}'
 757
 758
 759 def sanitized_Request(url, *args, **kwargs):
 760     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 761     if auth_header is not None:
 762         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 763         headers['Authorization'] = auth_header
 764     return compat_urllib_request.Request(url, *args, **kwargs)
 765
 766
 767 def expand_path(s):
 768     """Expand shell variables and ~"""
 769     return os.path.expandvars(compat_expanduser(s))
 770
 771
 772 def orderedSet(iterable):
 773     """ Remove all duplicates from the input iterable """
 774     res = []
 775     for el in iterable:
 776         if el not in res:
 777             res.append(el)
 778     return res
 779
 780
 781 def _htmlentity_transform(entity_with_semicolon):
 782     """Transforms an HTML entity to a character."""
 783     entity = entity_with_semicolon[:-1]
 784
 785     # Known non-numeric HTML entity
 786     if entity in compat_html_entities.name2codepoint:
 787         return compat_chr(compat_html_entities.name2codepoint[entity])
 788
 789     # TODO: HTML5 allows entities without a semicolon. For example,
 790     # '&Eacuteric' should be decoded as 'Éric'.
 791     if entity_with_semicolon in compat_html_entities_html5:
 792         return compat_html_entities_html5[entity_with_semicolon]
 793
 794     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 795     if mobj is not None:
 796         numstr = mobj.group(1)
 797         if numstr.startswith('x'):
 798             base = 16
 799             numstr = '0%s' % numstr
 800         else:
 801             base = 10
 802         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 803         with contextlib.suppress(ValueError):
 804             return compat_chr(int(numstr, base))
 805
 806     # Unknown entity in name, return its literal representation
 807     return '&%s;' % entity
 808
 809
 810 def unescapeHTML(s):
 811     if s is None:
 812         return None
 813     assert isinstance(s, str)
 814
 815     return re.sub(
 816         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 817
 818
 819 def escapeHTML(text):
 820     return (
 821         text
 822         .replace('&', '&amp;')
 823         .replace('<', '&lt;')
 824         .replace('>', '&gt;')
 825         .replace('"', '&quot;')
 826         .replace("'", '&#39;')
 827     )
 828
 829
 830 def process_communicate_or_kill(p, *args, **kwargs):
 831     write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
 832                  'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
 833     return Popen.communicate_or_kill(p, *args, **kwargs)
 834
 835
 836 class Popen(subprocess.Popen):
 837     if sys.platform == 'win32':
 838         _startupinfo = subprocess.STARTUPINFO()
 839         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 840     else:
 841         _startupinfo = None
 842
 843     def __init__(self, *args, **kwargs):
 844         super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
 845
 846     def communicate_or_kill(self, *args, **kwargs):
 847         try:
 848             return self.communicate(*args, **kwargs)
 849         except BaseException:  # Including KeyboardInterrupt
 850             self.kill()
 851             self.wait()
 852             raise
 853
 854
 855 def get_subprocess_encoding():
 856     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 857         # For subprocess calls, encode with locale encoding
 858         # Refer to http://stackoverflow.com/a/9951851/35070
 859         encoding = preferredencoding()
 860     else:
 861         encoding = sys.getfilesystemencoding()
 862     if encoding is None:
 863         encoding = 'utf-8'
 864     return encoding
 865
 866
 867 def encodeFilename(s, for_subprocess=False):
 868     assert isinstance(s, str)
 869     return s
 870
 871
 872 def decodeFilename(b, for_subprocess=False):
 873     return b
 874
 875
 876 def encodeArgument(s):
 877     # Legacy code that uses byte strings
 878     # Uncomment the following line after fixing all post processors
 879     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 880     return s if isinstance(s, str) else s.decode('ascii')
 881
 882
 883 def decodeArgument(b):
 884     return b
 885
 886
 887 def decodeOption(optval):
 888     if optval is None:
 889         return optval
 890     if isinstance(optval, bytes):
 891         optval = optval.decode(preferredencoding())
 892
 893     assert isinstance(optval, compat_str)
 894     return optval
 895
 896
 897 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 898
 899
 900 def timetuple_from_msec(msec):
 901     secs, msec = divmod(msec, 1000)
 902     mins, secs = divmod(secs, 60)
 903     hrs, mins = divmod(mins, 60)
 904     return _timetuple(hrs, mins, secs, msec)
 905
 906
 907 def formatSeconds(secs, delim=':', msec=False):
 908     time = timetuple_from_msec(secs * 1000)
 909     if time.hours:
 910         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 911     elif time.minutes:
 912         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 913     else:
 914         ret = '%d' % time.seconds
 915     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 916
 917
 918 def _ssl_load_windows_store_certs(ssl_context, storename):
 919     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 920     try:
 921         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 922                  if encoding == 'x509_asn' and (
 923                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 924     except PermissionError:
 925         return
 926     for cert in certs:
 927         with contextlib.suppress(ssl.SSLError):
 928             ssl_context.load_verify_locations(cadata=cert)
 929
 930
 931 def make_HTTPS_handler(params, **kwargs):
 932     opts_check_certificate = not params.get('nocheckcertificate')
 933     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 934     context.check_hostname = opts_check_certificate
 935     if params.get('legacyserverconnect'):
 936         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
 937         # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
 938         context.set_ciphers('DEFAULT')
 939
 940     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
 941     if opts_check_certificate:
 942         if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
 943             context.load_verify_locations(cafile=certifi.where())
 944         try:
 945             context.load_default_certs()
 946         # Work around the issue in load_default_certs when there are bad certificates. See:
 947         # https://github.com/yt-dlp/yt-dlp/issues/1060,
 948         # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
 949         except ssl.SSLError:
 950             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
 951             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
 952                 for storename in ('CA', 'ROOT'):
 953                     _ssl_load_windows_store_certs(context, storename)
 954             context.set_default_verify_paths()
 955
 956     client_certfile = params.get('client_certificate')
 957     if client_certfile:
 958         try:
 959             context.load_cert_chain(
 960                 client_certfile, keyfile=params.get('client_certificate_key'),
 961                 password=params.get('client_certificate_password'))
 962         except ssl.SSLError:
 963             raise YoutubeDLError('Unable to load client certificate')
 964
 965     # Some servers may reject requests if ALPN extension is not sent. See:
 966     # https://github.com/python/cpython/issues/85140
 967     # https://github.com/yt-dlp/yt-dlp/issues/3878
 968     with contextlib.suppress(NotImplementedError):
 969         context.set_alpn_protocols(['http/1.1'])
 970
 971     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 972
 973
 974 def bug_reports_message(before=';'):
 975     msg = ('please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , '
 976            'filling out the appropriate issue template. '
 977            'Confirm you are on the latest version using  yt-dlp -U')
 978
 979     before = before.rstrip()
 980     if not before or before.endswith(('.', '!', '?')):
 981         msg = msg[0].title() + msg[1:]
 982
 983     return (before + ' ' if before else '') + msg
 984
 985
 986 class YoutubeDLError(Exception):
 987     """Base exception for YoutubeDL errors."""
 988     msg = None
 989
 990     def __init__(self, msg=None):
 991         if msg is not None:
 992             self.msg = msg
 993         elif self.msg is None:
 994             self.msg = type(self).__name__
 995         super().__init__(self.msg)
 996
 997
 998 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 999 if hasattr(ssl, 'CertificateError'):
1000     network_exceptions.append(ssl.CertificateError)
1001 network_exceptions = tuple(network_exceptions)
1002
1003
1004 class ExtractorError(YoutubeDLError):
1005     """Error during info extraction."""
1006
1007     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1008         """ tb, if given, is the original traceback (so that it can be printed out).
1009         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1010         """
1011         if sys.exc_info()[0] in network_exceptions:
1012             expected = True
1013
1014         self.orig_msg = str(msg)
1015         self.traceback = tb
1016         self.expected = expected
1017         self.cause = cause
1018         self.video_id = video_id
1019         self.ie = ie
1020         self.exc_info = sys.exc_info()  # preserve original exception
1021
1022         super().__init__(''.join((
1023             format_field(ie, template='[%s] '),
1024             format_field(video_id, template='%s: '),
1025             msg,
1026             format_field(cause, template=' (caused by %r)'),
1027             '' if expected else bug_reports_message())))
1028
1029     def format_traceback(self):
1030         return join_nonempty(
1031             self.traceback and ''.join(traceback.format_tb(self.traceback)),
1032             self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1033             delim='\n') or None
1034
1035
1036 class UnsupportedError(ExtractorError):
1037     def __init__(self, url):
1038         super().__init__(
1039             'Unsupported URL: %s' % url, expected=True)
1040         self.url = url
1041
1042
1043 class RegexNotFoundError(ExtractorError):
1044     """Error when a regex didn't match"""
1045     pass
1046
1047
1048 class GeoRestrictedError(ExtractorError):
1049     """Geographic restriction Error exception.
1050
1051     This exception may be thrown when a video is not available from your
1052     geographic location due to geographic restrictions imposed by a website.
1053     """
1054
1055     def __init__(self, msg, countries=None, **kwargs):
1056         kwargs['expected'] = True
1057         super().__init__(msg, **kwargs)
1058         self.countries = countries
1059
1060
1061 class DownloadError(YoutubeDLError):
1062     """Download Error exception.
1063
1064     This exception may be thrown by FileDownloader objects if they are not
1065     configured to continue on errors. They will contain the appropriate
1066     error message.
1067     """
1068
1069     def __init__(self, msg, exc_info=None):
1070         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1071         super().__init__(msg)
1072         self.exc_info = exc_info
1073
1074
1075 class EntryNotInPlaylist(YoutubeDLError):
1076     """Entry not in playlist exception.
1077
1078     This exception will be thrown by YoutubeDL when a requested entry
1079     is not found in the playlist info_dict
1080     """
1081     msg = 'Entry not found in info'
1082
1083
1084 class SameFileError(YoutubeDLError):
1085     """Same File exception.
1086
1087     This exception will be thrown by FileDownloader objects if they detect
1088     multiple files would have to be downloaded to the same file on disk.
1089     """
1090     msg = 'Fixed output name but more than one file to download'
1091
1092     def __init__(self, filename=None):
1093         if filename is not None:
1094             self.msg += f': {filename}'
1095         super().__init__(self.msg)
1096
1097
1098 class PostProcessingError(YoutubeDLError):
1099     """Post Processing exception.
1100
1101     This exception may be raised by PostProcessor's .run() method to
1102     indicate an error in the postprocessing task.
1103     """
1104
1105
1106 class DownloadCancelled(YoutubeDLError):
1107     """ Exception raised when the download queue should be interrupted """
1108     msg = 'The download was cancelled'
1109
1110
1111 class ExistingVideoReached(DownloadCancelled):
1112     """ --break-on-existing triggered """
1113     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1114
1115
1116 class RejectedVideoReached(DownloadCancelled):
1117     """ --break-on-reject triggered """
1118     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1119
1120
1121 class MaxDownloadsReached(DownloadCancelled):
1122     """ --max-downloads limit has been reached. """
1123     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1124
1125
1126 class ReExtractInfo(YoutubeDLError):
1127     """ Video info needs to be re-extracted. """
1128
1129     def __init__(self, msg, expected=False):
1130         super().__init__(msg)
1131         self.expected = expected
1132
1133
1134 class ThrottledDownload(ReExtractInfo):
1135     """ Download speed below --throttled-rate. """
1136     msg = 'The download speed is below throttle limit'
1137
1138     def __init__(self):
1139         super().__init__(self.msg, expected=False)
1140
1141
1142 class UnavailableVideoError(YoutubeDLError):
1143     """Unavailable Format exception.
1144
1145     This exception will be thrown when a video is requested
1146     in a format that is not available for that video.
1147     """
1148     msg = 'Unable to download video'
1149
1150     def __init__(self, err=None):
1151         if err is not None:
1152             self.msg += f': {err}'
1153         super().__init__(self.msg)
1154
1155
1156 class ContentTooShortError(YoutubeDLError):
1157     """Content Too Short exception.
1158
1159     This exception may be raised by FileDownloader objects when a file they
1160     download is too small for what the server announced first, indicating
1161     the connection was probably interrupted.
1162     """
1163
1164     def __init__(self, downloaded, expected):
1165         super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1166         # Both in bytes
1167         self.downloaded = downloaded
1168         self.expected = expected
1169
1170
1171 class XAttrMetadataError(YoutubeDLError):
1172     def __init__(self, code=None, msg='Unknown error'):
1173         super().__init__(msg)
1174         self.code = code
1175         self.msg = msg
1176
1177         # Parsing code and msg
1178         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1179                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1180             self.reason = 'NO_SPACE'
1181         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1182             self.reason = 'VALUE_TOO_LONG'
1183         else:
1184             self.reason = 'NOT_SUPPORTED'
1185
1186
1187 class XAttrUnavailableError(YoutubeDLError):
1188     pass
1189
1190
1191 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1192     hc = http_class(*args, **kwargs)
1193     source_address = ydl_handler._params.get('source_address')
1194
1195     if source_address is not None:
1196         # This is to workaround _create_connection() from socket where it will try all
1197         # address data from getaddrinfo() including IPv6. This filters the result from
1198         # getaddrinfo() based on the source_address value.
1199         # This is based on the cpython socket.create_connection() function.
1200         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1201         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1202             host, port = address
1203             err = None
1204             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1205             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1206             ip_addrs = [addr for addr in addrs if addr[0] == af]
1207             if addrs and not ip_addrs:
1208                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1209                 raise OSError(
1210                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1211                     % (ip_version, source_address[0]))
1212             for res in ip_addrs:
1213                 af, socktype, proto, canonname, sa = res
1214                 sock = None
1215                 try:
1216                     sock = socket.socket(af, socktype, proto)
1217                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1218                         sock.settimeout(timeout)
1219                     sock.bind(source_address)
1220                     sock.connect(sa)
1221                     err = None  # Explicitly break reference cycle
1222                     return sock
1223                 except OSError as _:
1224                     err = _
1225                     if sock is not None:
1226                         sock.close()
1227             if err is not None:
1228                 raise err
1229             else:
1230                 raise OSError('getaddrinfo returns an empty list')
1231         if hasattr(hc, '_create_connection'):
1232             hc._create_connection = _create_connection
1233         hc.source_address = (source_address, 0)
1234
1235     return hc
1236
1237
1238 def handle_youtubedl_headers(headers):
1239     filtered_headers = headers
1240
1241     if 'Youtubedl-no-compression' in filtered_headers:
1242         filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
1243         del filtered_headers['Youtubedl-no-compression']
1244
1245     return filtered_headers
1246
1247
1248 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1249     """Handler for HTTP requests and responses.
1250
1251     This class, when installed with an OpenerDirector, automatically adds
1252     the standard headers to every HTTP request and handles gzipped and
1253     deflated responses from web servers. If compression is to be avoided in
1254     a particular request, the original request in the program code only has
1255     to include the HTTP header "Youtubedl-no-compression", which will be
1256     removed before making the real request.
1257
1258     Part of this code was copied from:
1259
1260     http://techknack.net/python-urllib2-handlers/
1261
1262     Andrew Rowls, the author of that code, agreed to release it to the
1263     public domain.
1264     """
1265
1266     def __init__(self, params, *args, **kwargs):
1267         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1268         self._params = params
1269
1270     def http_open(self, req):
1271         conn_class = compat_http_client.HTTPConnection
1272
1273         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1274         if socks_proxy:
1275             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1276             del req.headers['Ytdl-socks-proxy']
1277
1278         return self.do_open(functools.partial(
1279             _create_http_connection, self, conn_class, False),
1280             req)
1281
1282     @staticmethod
1283     def deflate(data):
1284         if not data:
1285             return data
1286         try:
1287             return zlib.decompress(data, -zlib.MAX_WBITS)
1288         except zlib.error:
1289             return zlib.decompress(data)
1290
1291     @staticmethod
1292     def brotli(data):
1293         if not data:
1294             return data
1295         return brotli.decompress(data)
1296
1297     def http_request(self, req):
1298         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1299         # always respected by websites, some tend to give out URLs with non percent-encoded
1300         # non-ASCII characters (see telemb.py, ard.py [#3412])
1301         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1302         # To work around aforementioned issue we will replace request's original URL with
1303         # percent-encoded one
1304         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1305         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1306         url = req.get_full_url()
1307         url_escaped = escape_url(url)
1308
1309         # Substitute URL if any change after escaping
1310         if url != url_escaped:
1311             req = update_Request(req, url=url_escaped)
1312
1313         for h, v in self._params.get('http_headers', std_headers).items():
1314             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1315             # The dict keys are capitalized because of this bug by urllib
1316             if h.capitalize() not in req.headers:
1317                 req.add_header(h, v)
1318
1319         if 'Accept-encoding' not in req.headers:
1320             req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1321
1322         req.headers = handle_youtubedl_headers(req.headers)
1323
1324         return req
1325
1326     def http_response(self, req, resp):
1327         old_resp = resp
1328         # gzip
1329         if resp.headers.get('Content-encoding', '') == 'gzip':
1330             content = resp.read()
1331             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1332             try:
1333                 uncompressed = io.BytesIO(gz.read())
1334             except OSError as original_ioerror:
1335                 # There may be junk add the end of the file
1336                 # See http://stackoverflow.com/q/4928560/35070 for details
1337                 for i in range(1, 1024):
1338                     try:
1339                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1340                         uncompressed = io.BytesIO(gz.read())
1341                     except OSError:
1342                         continue
1343                     break
1344                 else:
1345                     raise original_ioerror
1346             resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1347             resp.msg = old_resp.msg
1348             del resp.headers['Content-encoding']
1349         # deflate
1350         if resp.headers.get('Content-encoding', '') == 'deflate':
1351             gz = io.BytesIO(self.deflate(resp.read()))
1352             resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1353             resp.msg = old_resp.msg
1354             del resp.headers['Content-encoding']
1355         # brotli
1356         if resp.headers.get('Content-encoding', '') == 'br':
1357             resp = compat_urllib_request.addinfourl(
1358                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1359             resp.msg = old_resp.msg
1360             del resp.headers['Content-encoding']
1361         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1362         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1363         if 300 <= resp.code < 400:
1364             location = resp.headers.get('Location')
1365             if location:
1366                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1367                 location = location.encode('iso-8859-1').decode()
1368                 location_escaped = escape_url(location)
1369                 if location != location_escaped:
1370                     del resp.headers['Location']
1371                     resp.headers['Location'] = location_escaped
1372         return resp
1373
1374     https_request = http_request
1375     https_response = http_response
1376
1377
1378 def make_socks_conn_class(base_class, socks_proxy):
1379     assert issubclass(base_class, (
1380         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1381
1382     url_components = compat_urlparse.urlparse(socks_proxy)
1383     if url_components.scheme.lower() == 'socks5':
1384         socks_type = ProxyType.SOCKS5
1385     elif url_components.scheme.lower() in ('socks', 'socks4'):
1386         socks_type = ProxyType.SOCKS4
1387     elif url_components.scheme.lower() == 'socks4a':
1388         socks_type = ProxyType.SOCKS4A
1389
1390     def unquote_if_non_empty(s):
1391         if not s:
1392             return s
1393         return compat_urllib_parse_unquote_plus(s)
1394
1395     proxy_args = (
1396         socks_type,
1397         url_components.hostname, url_components.port or 1080,
1398         True,  # Remote DNS
1399         unquote_if_non_empty(url_components.username),
1400         unquote_if_non_empty(url_components.password),
1401     )
1402
1403     class SocksConnection(base_class):
1404         def connect(self):
1405             self.sock = sockssocket()
1406             self.sock.setproxy(*proxy_args)
1407             if isinstance(self.timeout, (int, float)):
1408                 self.sock.settimeout(self.timeout)
1409             self.sock.connect((self.host, self.port))
1410
1411             if isinstance(self, compat_http_client.HTTPSConnection):
1412                 if hasattr(self, '_context'):  # Python > 2.6
1413                     self.sock = self._context.wrap_socket(
1414                         self.sock, server_hostname=self.host)
1415                 else:
1416                     self.sock = ssl.wrap_socket(self.sock)
1417
1418     return SocksConnection
1419
1420
1421 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1422     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1423         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1424         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1425         self._params = params
1426
1427     def https_open(self, req):
1428         kwargs = {}
1429         conn_class = self._https_conn_class
1430
1431         if hasattr(self, '_context'):  # python > 2.6
1432             kwargs['context'] = self._context
1433         if hasattr(self, '_check_hostname'):  # python 3.x
1434             kwargs['check_hostname'] = self._check_hostname
1435
1436         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1437         if socks_proxy:
1438             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1439             del req.headers['Ytdl-socks-proxy']
1440
1441         try:
1442             return self.do_open(
1443                 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1444         except urllib.error.URLError as e:
1445             if (isinstance(e.reason, ssl.SSLError)
1446                     and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1447                 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1448             raise
1449
1450
1451 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1452     """
1453     See [1] for cookie file format.
1454
1455     1. https://curl.haxx.se/docs/http-cookies.html
1456     """
1457     _HTTPONLY_PREFIX = '#HttpOnly_'
1458     _ENTRY_LEN = 7
1459     _HEADER = '''# Netscape HTTP Cookie File
1460 # This file is generated by yt-dlp.  Do not edit.
1461
1462 '''
1463     _CookieFileEntry = collections.namedtuple(
1464         'CookieFileEntry',
1465         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1466
1467     def __init__(self, filename=None, *args, **kwargs):
1468         super().__init__(None, *args, **kwargs)
1469         if self.is_path(filename):
1470             filename = os.fspath(filename)
1471         self.filename = filename
1472
1473     @staticmethod
1474     def _true_or_false(cndn):
1475         return 'TRUE' if cndn else 'FALSE'
1476
1477     @staticmethod
1478     def is_path(file):
1479         return isinstance(file, (str, bytes, os.PathLike))
1480
1481     @contextlib.contextmanager
1482     def open(self, file, *, write=False):
1483         if self.is_path(file):
1484             with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1485                 yield f
1486         else:
1487             if write:
1488                 file.truncate(0)
1489             yield file
1490
1491     def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1492         now = time.time()
1493         for cookie in self:
1494             if (not ignore_discard and cookie.discard
1495                     or not ignore_expires and cookie.is_expired(now)):
1496                 continue
1497             name, value = cookie.name, cookie.value
1498             if value is None:
1499                 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1500                 # with no name, whereas http.cookiejar regards it as a
1501                 # cookie with no value.
1502                 name, value = '', name
1503             f.write('%s\n' % '\t'.join((
1504                 cookie.domain,
1505                 self._true_or_false(cookie.domain.startswith('.')),
1506                 cookie.path,
1507                 self._true_or_false(cookie.secure),
1508                 str_or_none(cookie.expires, default=''),
1509                 name, value
1510             )))
1511
1512     def save(self, filename=None, *args, **kwargs):
1513         """
1514         Save cookies to a file.
1515         Code is taken from CPython 3.6
1516         https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1517
1518         if filename is None:
1519             if self.filename is not None:
1520                 filename = self.filename
1521             else:
1522                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1523
1524         # Store session cookies with `expires` set to 0 instead of an empty string
1525         for cookie in self:
1526             if cookie.expires is None:
1527                 cookie.expires = 0
1528
1529         with self.open(filename, write=True) as f:
1530             f.write(self._HEADER)
1531             self._really_save(f, *args, **kwargs)
1532
1533     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1534         """Load cookies from a file."""
1535         if filename is None:
1536             if self.filename is not None:
1537                 filename = self.filename
1538             else:
1539                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1540
1541         def prepare_line(line):
1542             if line.startswith(self._HTTPONLY_PREFIX):
1543                 line = line[len(self._HTTPONLY_PREFIX):]
1544             # comments and empty lines are fine
1545             if line.startswith('#') or not line.strip():
1546                 return line
1547             cookie_list = line.split('\t')
1548             if len(cookie_list) != self._ENTRY_LEN:
1549                 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1550             cookie = self._CookieFileEntry(*cookie_list)
1551             if cookie.expires_at and not cookie.expires_at.isdigit():
1552                 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1553             return line
1554
1555         cf = io.StringIO()
1556         with self.open(filename) as f:
1557             for line in f:
1558                 try:
1559                     cf.write(prepare_line(line))
1560                 except compat_cookiejar.LoadError as e:
1561                     if f'{line.strip()} '[0] in '[{"':
1562                         raise compat_cookiejar.LoadError(
1563                             'Cookies file must be Netscape formatted, not JSON. See  '
1564                             'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1565                     write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1566                     continue
1567         cf.seek(0)
1568         self._really_load(cf, filename, ignore_discard, ignore_expires)
1569         # Session cookies are denoted by either `expires` field set to
1570         # an empty string or 0. MozillaCookieJar only recognizes the former
1571         # (see [1]). So we need force the latter to be recognized as session
1572         # cookies on our own.
1573         # Session cookies may be important for cookies-based authentication,
1574         # e.g. usually, when user does not check 'Remember me' check box while
1575         # logging in on a site, some important cookies are stored as session
1576         # cookies so that not recognizing them will result in failed login.
1577         # 1. https://bugs.python.org/issue17164
1578         for cookie in self:
1579             # Treat `expires=0` cookies as session cookies
1580             if cookie.expires == 0:
1581                 cookie.expires = None
1582                 cookie.discard = True
1583
1584
1585 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1586     def __init__(self, cookiejar=None):
1587         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1588
1589     def http_response(self, request, response):
1590         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1591
1592     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1593     https_response = http_response
1594
1595
1596 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1597     """YoutubeDL redirect handler
1598
1599     The code is based on HTTPRedirectHandler implementation from CPython [1].
1600
1601     This redirect handler solves two issues:
1602      - ensures redirect URL is always unicode under python 2
1603      - introduces support for experimental HTTP response status code
1604        308 Permanent Redirect [2] used by some sites [3]
1605
1606     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1607     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1608     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1609     """
1610
1611     http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1612
1613     def redirect_request(self, req, fp, code, msg, headers, newurl):
1614         """Return a Request or None in response to a redirect.
1615
1616         This is called by the http_error_30x methods when a
1617         redirection response is received.  If a redirection should
1618         take place, return a new Request to allow http_error_30x to
1619         perform the redirect.  Otherwise, raise HTTPError if no-one
1620         else should try to handle this url.  Return None if you can't
1621         but another Handler might.
1622         """
1623         m = req.get_method()
1624         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1625                  or code in (301, 302, 303) and m == "POST")):
1626             raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1627         # Strictly (according to RFC 2616), 301 or 302 in response to
1628         # a POST MUST NOT cause a redirection without confirmation
1629         # from the user (of urllib.request, in this case).  In practice,
1630         # essentially all clients do redirect in this case, so we do
1631         # the same.
1632
1633         # Be conciliant with URIs containing a space.  This is mainly
1634         # redundant with the more complete encoding done in http_error_302(),
1635         # but it is kept for compatibility with other callers.
1636         newurl = newurl.replace(' ', '%20')
1637
1638         CONTENT_HEADERS = ("content-length", "content-type")
1639         # NB: don't use dict comprehension for python 2.6 compatibility
1640         newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1641
1642         # A 303 must either use GET or HEAD for subsequent request
1643         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1644         if code == 303 and m != 'HEAD':
1645             m = 'GET'
1646         # 301 and 302 redirects are commonly turned into a GET from a POST
1647         # for subsequent requests by browsers, so we'll do the same.
1648         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1649         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1650         if code in (301, 302) and m == 'POST':
1651             m = 'GET'
1652
1653         return compat_urllib_request.Request(
1654             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1655             unverifiable=True, method=m)
1656
1657
1658 def extract_timezone(date_str):
1659     m = re.search(
1660         r'''(?x)
1661             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1662             (?P<tz>Z|                                            # just the UTC Z, or
1663                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1664                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1665                    [ ]?                                          # optional space
1666                 (?P<sign>\+|-)                                   # +/-
1667                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1668             $)
1669         ''', date_str)
1670     if not m:
1671         timezone = datetime.timedelta()
1672     else:
1673         date_str = date_str[:-len(m.group('tz'))]
1674         if not m.group('sign'):
1675             timezone = datetime.timedelta()
1676         else:
1677             sign = 1 if m.group('sign') == '+' else -1
1678             timezone = datetime.timedelta(
1679                 hours=sign * int(m.group('hours')),
1680                 minutes=sign * int(m.group('minutes')))
1681     return timezone, date_str
1682
1683
1684 def parse_iso8601(date_str, delimiter='T', timezone=None):
1685     """ Return a UNIX timestamp from the given date """
1686
1687     if date_str is None:
1688         return None
1689
1690     date_str = re.sub(r'\.[0-9]+', '', date_str)
1691
1692     if timezone is None:
1693         timezone, date_str = extract_timezone(date_str)
1694
1695     with contextlib.suppress(ValueError):
1696         date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1697         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1698         return calendar.timegm(dt.timetuple())
1699
1700
1701 def date_formats(day_first=True):
1702     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1703
1704
1705 def unified_strdate(date_str, day_first=True):
1706     """Return a string with the date in the format YYYYMMDD"""
1707
1708     if date_str is None:
1709         return None
1710     upload_date = None
1711     # Replace commas
1712     date_str = date_str.replace(',', ' ')
1713     # Remove AM/PM + timezone
1714     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1715     _, date_str = extract_timezone(date_str)
1716
1717     for expression in date_formats(day_first):
1718         with contextlib.suppress(ValueError):
1719             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1720     if upload_date is None:
1721         timetuple = email.utils.parsedate_tz(date_str)
1722         if timetuple:
1723             with contextlib.suppress(ValueError):
1724                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1725     if upload_date is not None:
1726         return compat_str(upload_date)
1727
1728
1729 def unified_timestamp(date_str, day_first=True):
1730     if date_str is None:
1731         return None
1732
1733     date_str = re.sub(r'[,|]', '', date_str)
1734
1735     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1736     timezone, date_str = extract_timezone(date_str)
1737
1738     # Remove AM/PM + timezone
1739     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1740
1741     # Remove unrecognized timezones from ISO 8601 alike timestamps
1742     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1743     if m:
1744         date_str = date_str[:-len(m.group('tz'))]
1745
1746     # Python only supports microseconds, so remove nanoseconds
1747     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1748     if m:
1749         date_str = m.group(1)
1750
1751     for expression in date_formats(day_first):
1752         with contextlib.suppress(ValueError):
1753             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1754             return calendar.timegm(dt.timetuple())
1755     timetuple = email.utils.parsedate_tz(date_str)
1756     if timetuple:
1757         return calendar.timegm(timetuple) + pm_delta * 3600
1758
1759
1760 def determine_ext(url, default_ext='unknown_video'):
1761     if url is None or '.' not in url:
1762         return default_ext
1763     guess = url.partition('?')[0].rpartition('.')[2]
1764     if re.match(r'^[A-Za-z0-9]+$', guess):
1765         return guess
1766     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1767     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1768         return guess.rstrip('/')
1769     else:
1770         return default_ext
1771
1772
1773 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1774     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1775
1776
1777 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1778     R"""
1779     Return a datetime object from a string.
1780     Supported format:
1781         (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1782
1783     @param format       strftime format of DATE
1784     @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
1785                         auto: round to the unit provided in date_str (if applicable).
1786     """
1787     auto_precision = False
1788     if precision == 'auto':
1789         auto_precision = True
1790         precision = 'microsecond'
1791     today = datetime_round(datetime.datetime.utcnow(), precision)
1792     if date_str in ('now', 'today'):
1793         return today
1794     if date_str == 'yesterday':
1795         return today - datetime.timedelta(days=1)
1796     match = re.match(
1797         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1798         date_str)
1799     if match is not None:
1800         start_time = datetime_from_str(match.group('start'), precision, format)
1801         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1802         unit = match.group('unit')
1803         if unit == 'month' or unit == 'year':
1804             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1805             unit = 'day'
1806         else:
1807             if unit == 'week':
1808                 unit = 'day'
1809                 time *= 7
1810             delta = datetime.timedelta(**{unit + 's': time})
1811             new_date = start_time + delta
1812         if auto_precision:
1813             return datetime_round(new_date, unit)
1814         return new_date
1815
1816     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1817
1818
1819 def date_from_str(date_str, format='%Y%m%d', strict=False):
1820     R"""
1821     Return a date object from a string using datetime_from_str
1822
1823     @param strict  Restrict allowed patterns to "YYYYMMDD" and
1824                    (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1825     """
1826     if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1827         raise ValueError(f'Invalid date format "{date_str}"')
1828     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1829
1830
1831 def datetime_add_months(dt, months):
1832     """Increment/Decrement a datetime object by months."""
1833     month = dt.month + months - 1
1834     year = dt.year + month // 12
1835     month = month % 12 + 1
1836     day = min(dt.day, calendar.monthrange(year, month)[1])
1837     return dt.replace(year, month, day)
1838
1839
1840 def datetime_round(dt, precision='day'):
1841     """
1842     Round a datetime object's time to a specific precision
1843     """
1844     if precision == 'microsecond':
1845         return dt
1846
1847     unit_seconds = {
1848         'day': 86400,
1849         'hour': 3600,
1850         'minute': 60,
1851         'second': 1,
1852     }
1853     roundto = lambda x, n: ((x + n / 2) // n) * n
1854     timestamp = calendar.timegm(dt.timetuple())
1855     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1856
1857
1858 def hyphenate_date(date_str):
1859     """
1860     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1861     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1862     if match is not None:
1863         return '-'.join(match.groups())
1864     else:
1865         return date_str
1866
1867
1868 class DateRange:
1869     """Represents a time interval between two dates"""
1870
1871     def __init__(self, start=None, end=None):
1872         """start and end must be strings in the format accepted by date"""
1873         if start is not None:
1874             self.start = date_from_str(start, strict=True)
1875         else:
1876             self.start = datetime.datetime.min.date()
1877         if end is not None:
1878             self.end = date_from_str(end, strict=True)
1879         else:
1880             self.end = datetime.datetime.max.date()
1881         if self.start > self.end:
1882             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1883
1884     @classmethod
1885     def day(cls, day):
1886         """Returns a range that only contains the given day"""
1887         return cls(day, day)
1888
1889     def __contains__(self, date):
1890         """Check if the date is in the range"""
1891         if not isinstance(date, datetime.date):
1892             date = date_from_str(date)
1893         return self.start <= date <= self.end
1894
1895     def __str__(self):
1896         return f'{self.start.isoformat()} - {self.end.isoformat()}'
1897
1898
1899 def platform_name():
1900     """ Returns the platform name as a compat_str """
1901     res = platform.platform()
1902     if isinstance(res, bytes):
1903         res = res.decode(preferredencoding())
1904
1905     assert isinstance(res, compat_str)
1906     return res
1907
1908
1909 @functools.cache
1910 def get_windows_version():
1911     ''' Get Windows version. returns () if it's not running on Windows '''
1912     if compat_os_name == 'nt':
1913         return version_tuple(platform.win32_ver()[1])
1914     else:
1915         return ()
1916
1917
1918 def write_string(s, out=None, encoding=None):
1919     assert isinstance(s, str)
1920     out = out or sys.stderr
1921
1922     if compat_os_name == 'nt' and supports_terminal_sequences(out):
1923         s = re.sub(r'([\r\n]+)', r' \1', s)
1924
1925     enc, buffer = None, out
1926     if 'b' in getattr(out, 'mode', ''):
1927         enc = encoding or preferredencoding()
1928     elif hasattr(out, 'buffer'):
1929         buffer = out.buffer
1930         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1931
1932     buffer.write(s.encode(enc, 'ignore') if enc else s)
1933     out.flush()
1934
1935
1936 def bytes_to_intlist(bs):
1937     if not bs:
1938         return []
1939     if isinstance(bs[0], int):  # Python 3
1940         return list(bs)
1941     else:
1942         return [ord(c) for c in bs]
1943
1944
1945 def intlist_to_bytes(xs):
1946     if not xs:
1947         return b''
1948     return compat_struct_pack('%dB' % len(xs), *xs)
1949
1950
1951 class LockingUnsupportedError(OSError):
1952     msg = 'File locking is not supported'
1953
1954     def __init__(self):
1955         super().__init__(self.msg)
1956
1957
1958 # Cross-platform file locking
1959 if sys.platform == 'win32':
1960     import ctypes.wintypes
1961     import msvcrt
1962
1963     class OVERLAPPED(ctypes.Structure):
1964         _fields_ = [
1965             ('Internal', ctypes.wintypes.LPVOID),
1966             ('InternalHigh', ctypes.wintypes.LPVOID),
1967             ('Offset', ctypes.wintypes.DWORD),
1968             ('OffsetHigh', ctypes.wintypes.DWORD),
1969             ('hEvent', ctypes.wintypes.HANDLE),
1970         ]
1971
1972     kernel32 = ctypes.windll.kernel32
1973     LockFileEx = kernel32.LockFileEx
1974     LockFileEx.argtypes = [
1975         ctypes.wintypes.HANDLE,     # hFile
1976         ctypes.wintypes.DWORD,      # dwFlags
1977         ctypes.wintypes.DWORD,      # dwReserved
1978         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1979         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1980         ctypes.POINTER(OVERLAPPED)  # Overlapped
1981     ]
1982     LockFileEx.restype = ctypes.wintypes.BOOL
1983     UnlockFileEx = kernel32.UnlockFileEx
1984     UnlockFileEx.argtypes = [
1985         ctypes.wintypes.HANDLE,     # hFile
1986         ctypes.wintypes.DWORD,      # dwReserved
1987         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1988         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1989         ctypes.POINTER(OVERLAPPED)  # Overlapped
1990     ]
1991     UnlockFileEx.restype = ctypes.wintypes.BOOL
1992     whole_low = 0xffffffff
1993     whole_high = 0x7fffffff
1994
1995     def _lock_file(f, exclusive, block):
1996         overlapped = OVERLAPPED()
1997         overlapped.Offset = 0
1998         overlapped.OffsetHigh = 0
1999         overlapped.hEvent = 0
2000         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2001
2002         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2003                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2004                           0, whole_low, whole_high, f._lock_file_overlapped_p):
2005             raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2006
2007     def _unlock_file(f):
2008         assert f._lock_file_overlapped_p
2009         handle = msvcrt.get_osfhandle(f.fileno())
2010         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2011             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2012
2013 else:
2014     try:
2015         import fcntl
2016
2017         def _lock_file(f, exclusive, block):
2018             flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2019             if not block:
2020                 flags |= fcntl.LOCK_NB
2021             try:
2022                 fcntl.flock(f, flags)
2023             except BlockingIOError:
2024                 raise
2025             except OSError:  # AOSP does not have flock()
2026                 fcntl.lockf(f, flags)
2027
2028         def _unlock_file(f):
2029             try:
2030                 fcntl.flock(f, fcntl.LOCK_UN)
2031             except OSError:
2032                 fcntl.lockf(f, fcntl.LOCK_UN)
2033
2034     except ImportError:
2035
2036         def _lock_file(f, exclusive, block):
2037             raise LockingUnsupportedError()
2038
2039         def _unlock_file(f):
2040             raise LockingUnsupportedError()
2041
2042
2043 class locked_file:
2044     locked = False
2045
2046     def __init__(self, filename, mode, block=True, encoding=None):
2047         if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2048             raise NotImplementedError(mode)
2049         self.mode, self.block = mode, block
2050
2051         writable = any(f in mode for f in 'wax+')
2052         readable = any(f in mode for f in 'r+')
2053         flags = functools.reduce(operator.ior, (
2054             getattr(os, 'O_CLOEXEC', 0),  # UNIX only
2055             getattr(os, 'O_BINARY', 0),  # Windows only
2056             getattr(os, 'O_NOINHERIT', 0),  # Windows only
2057             os.O_CREAT if writable else 0,  # O_TRUNC only after locking
2058             os.O_APPEND if 'a' in mode else 0,
2059             os.O_EXCL if 'x' in mode else 0,
2060             os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2061         ))
2062
2063         self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2064
2065     def __enter__(self):
2066         exclusive = 'r' not in self.mode
2067         try:
2068             _lock_file(self.f, exclusive, self.block)
2069             self.locked = True
2070         except OSError:
2071             self.f.close()
2072             raise
2073         if 'w' in self.mode:
2074             try:
2075                 self.f.truncate()
2076             except OSError as e:
2077                 if e.errno not in (
2078                     errno.ESPIPE,  # Illegal seek - expected for FIFO
2079                     errno.EINVAL,  # Invalid argument - expected for /dev/null
2080                 ):
2081                     raise
2082         return self
2083
2084     def unlock(self):
2085         if not self.locked:
2086             return
2087         try:
2088             _unlock_file(self.f)
2089         finally:
2090             self.locked = False
2091
2092     def __exit__(self, *_):
2093         try:
2094             self.unlock()
2095         finally:
2096             self.f.close()
2097
2098     open = __enter__
2099     close = __exit__
2100
2101     def __getattr__(self, attr):
2102         return getattr(self.f, attr)
2103
2104     def __iter__(self):
2105         return iter(self.f)
2106
2107
2108 @functools.cache
2109 def get_filesystem_encoding():
2110     encoding = sys.getfilesystemencoding()
2111     return encoding if encoding is not None else 'utf-8'
2112
2113
2114 def shell_quote(args):
2115     quoted_args = []
2116     encoding = get_filesystem_encoding()
2117     for a in args:
2118         if isinstance(a, bytes):
2119             # We may get a filename encoded with 'encodeFilename'
2120             a = a.decode(encoding)
2121         quoted_args.append(compat_shlex_quote(a))
2122     return ' '.join(quoted_args)
2123
2124
2125 def smuggle_url(url, data):
2126     """ Pass additional data in a URL for internal use. """
2127
2128     url, idata = unsmuggle_url(url, {})
2129     data.update(idata)
2130     sdata = compat_urllib_parse_urlencode(
2131         {'__youtubedl_smuggle': json.dumps(data)})
2132     return url + '#' + sdata
2133
2134
2135 def unsmuggle_url(smug_url, default=None):
2136     if '#__youtubedl_smuggle' not in smug_url:
2137         return smug_url, default
2138     url, _, sdata = smug_url.rpartition('#')
2139     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2140     data = json.loads(jsond)
2141     return url, data
2142
2143
2144 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2145     """ Formats numbers with decimal sufixes like K, M, etc """
2146     num, factor = float_or_none(num), float(factor)
2147     if num is None or num < 0:
2148         return None
2149     POSSIBLE_SUFFIXES = 'kMGTPEZY'
2150     exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2151     suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2152     if factor == 1024:
2153         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2154     converted = num / (factor ** exponent)
2155     return fmt % (converted, suffix)
2156
2157
2158 def format_bytes(bytes):
2159     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2160
2161
2162 def lookup_unit_table(unit_table, s):
2163     units_re = '|'.join(re.escape(u) for u in unit_table)
2164     m = re.match(
2165         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2166     if not m:
2167         return None
2168     num_str = m.group('num').replace(',', '.')
2169     mult = unit_table[m.group('unit')]
2170     return int(float(num_str) * mult)
2171
2172
2173 def parse_filesize(s):
2174     if s is None:
2175         return None
2176
2177     # The lower-case forms are of course incorrect and unofficial,
2178     # but we support those too
2179     _UNIT_TABLE = {
2180         'B': 1,
2181         'b': 1,
2182         'bytes': 1,
2183         'KiB': 1024,
2184         'KB': 1000,
2185         'kB': 1024,
2186         'Kb': 1000,
2187         'kb': 1000,
2188         'kilobytes': 1000,
2189         'kibibytes': 1024,
2190         'MiB': 1024 ** 2,
2191         'MB': 1000 ** 2,
2192         'mB': 1024 ** 2,
2193         'Mb': 1000 ** 2,
2194         'mb': 1000 ** 2,
2195         'megabytes': 1000 ** 2,
2196         'mebibytes': 1024 ** 2,
2197         'GiB': 1024 ** 3,
2198         'GB': 1000 ** 3,
2199         'gB': 1024 ** 3,
2200         'Gb': 1000 ** 3,
2201         'gb': 1000 ** 3,
2202         'gigabytes': 1000 ** 3,
2203         'gibibytes': 1024 ** 3,
2204         'TiB': 1024 ** 4,
2205         'TB': 1000 ** 4,
2206         'tB': 1024 ** 4,
2207         'Tb': 1000 ** 4,
2208         'tb': 1000 ** 4,
2209         'terabytes': 1000 ** 4,
2210         'tebibytes': 1024 ** 4,
2211         'PiB': 1024 ** 5,
2212         'PB': 1000 ** 5,
2213         'pB': 1024 ** 5,
2214         'Pb': 1000 ** 5,
2215         'pb': 1000 ** 5,
2216         'petabytes': 1000 ** 5,
2217         'pebibytes': 1024 ** 5,
2218         'EiB': 1024 ** 6,
2219         'EB': 1000 ** 6,
2220         'eB': 1024 ** 6,
2221         'Eb': 1000 ** 6,
2222         'eb': 1000 ** 6,
2223         'exabytes': 1000 ** 6,
2224         'exbibytes': 1024 ** 6,
2225         'ZiB': 1024 ** 7,
2226         'ZB': 1000 ** 7,
2227         'zB': 1024 ** 7,
2228         'Zb': 1000 ** 7,
2229         'zb': 1000 ** 7,
2230         'zettabytes': 1000 ** 7,
2231         'zebibytes': 1024 ** 7,
2232         'YiB': 1024 ** 8,
2233         'YB': 1000 ** 8,
2234         'yB': 1024 ** 8,
2235         'Yb': 1000 ** 8,
2236         'yb': 1000 ** 8,
2237         'yottabytes': 1000 ** 8,
2238         'yobibytes': 1024 ** 8,
2239     }
2240
2241     return lookup_unit_table(_UNIT_TABLE, s)
2242
2243
2244 def parse_count(s):
2245     if s is None:
2246         return None
2247
2248     s = re.sub(r'^[^\d]+\s', '', s).strip()
2249
2250     if re.match(r'^[\d,.]+$', s):
2251         return str_to_int(s)
2252
2253     _UNIT_TABLE = {
2254         'k': 1000,
2255         'K': 1000,
2256         'm': 1000 ** 2,
2257         'M': 1000 ** 2,
2258         'kk': 1000 ** 2,
2259         'KK': 1000 ** 2,
2260         'b': 1000 ** 3,
2261         'B': 1000 ** 3,
2262     }
2263
2264     ret = lookup_unit_table(_UNIT_TABLE, s)
2265     if ret is not None:
2266         return ret
2267
2268     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2269     if mobj:
2270         return str_to_int(mobj.group(1))
2271
2272
2273 def parse_resolution(s, *, lenient=False):
2274     if s is None:
2275         return {}
2276
2277     if lenient:
2278         mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2279     else:
2280         mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2281     if mobj:
2282         return {
2283             'width': int(mobj.group('w')),
2284             'height': int(mobj.group('h')),
2285         }
2286
2287     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2288     if mobj:
2289         return {'height': int(mobj.group(1))}
2290
2291     mobj = re.search(r'\b([48])[kK]\b', s)
2292     if mobj:
2293         return {'height': int(mobj.group(1)) * 540}
2294
2295     return {}
2296
2297
2298 def parse_bitrate(s):
2299     if not isinstance(s, compat_str):
2300         return
2301     mobj = re.search(r'\b(\d+)\s*kbps', s)
2302     if mobj:
2303         return int(mobj.group(1))
2304
2305
2306 def month_by_name(name, lang='en'):
2307     """ Return the number of a month by (locale-independently) English name """
2308
2309     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2310
2311     try:
2312         return month_names.index(name) + 1
2313     except ValueError:
2314         return None
2315
2316
2317 def month_by_abbreviation(abbrev):
2318     """ Return the number of a month by (locale-independently) English
2319         abbreviations """
2320
2321     try:
2322         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2323     except ValueError:
2324         return None
2325
2326
2327 def fix_xml_ampersands(xml_str):
2328     """Replace all the '&' by '&amp;' in XML"""
2329     return re.sub(
2330         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2331         '&amp;',
2332         xml_str)
2333
2334
2335 def setproctitle(title):
2336     assert isinstance(title, compat_str)
2337
2338     # ctypes in Jython is not complete
2339     # http://bugs.jython.org/issue2148
2340     if sys.platform.startswith('java'):
2341         return
2342
2343     try:
2344         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2345     except OSError:
2346         return
2347     except TypeError:
2348         # LoadLibrary in Windows Python 2.7.13 only expects
2349         # a bytestring, but since unicode_literals turns
2350         # every string into a unicode string, it fails.
2351         return
2352     title_bytes = title.encode()
2353     buf = ctypes.create_string_buffer(len(title_bytes))
2354     buf.value = title_bytes
2355     try:
2356         libc.prctl(15, buf, 0, 0, 0)
2357     except AttributeError:
2358         return  # Strange libc, just skip this
2359
2360
2361 def remove_start(s, start):
2362     return s[len(start):] if s is not None and s.startswith(start) else s
2363
2364
2365 def remove_end(s, end):
2366     return s[:-len(end)] if s is not None and s.endswith(end) else s
2367
2368
2369 def remove_quotes(s):
2370     if s is None or len(s) < 2:
2371         return s
2372     for quote in ('"', "'", ):
2373         if s[0] == quote and s[-1] == quote:
2374             return s[1:-1]
2375     return s
2376
2377
2378 def get_domain(url):
2379     domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2380     return domain.group('domain') if domain else None
2381
2382
2383 def url_basename(url):
2384     path = compat_urlparse.urlparse(url).path
2385     return path.strip('/').split('/')[-1]
2386
2387
2388 def base_url(url):
2389     return re.match(r'https?://[^?#&]+/', url).group()
2390
2391
2392 def urljoin(base, path):
2393     if isinstance(path, bytes):
2394         path = path.decode()
2395     if not isinstance(path, compat_str) or not path:
2396         return None
2397     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2398         return path
2399     if isinstance(base, bytes):
2400         base = base.decode()
2401     if not isinstance(base, compat_str) or not re.match(
2402             r'^(?:https?:)?//', base):
2403         return None
2404     return compat_urlparse.urljoin(base, path)
2405
2406
2407 class HEADRequest(compat_urllib_request.Request):
2408     def get_method(self):
2409         return 'HEAD'
2410
2411
2412 class PUTRequest(compat_urllib_request.Request):
2413     def get_method(self):
2414         return 'PUT'
2415
2416
2417 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2418     if get_attr and v is not None:
2419         v = getattr(v, get_attr, None)
2420     try:
2421         return int(v) * invscale // scale
2422     except (ValueError, TypeError, OverflowError):
2423         return default
2424
2425
2426 def str_or_none(v, default=None):
2427     return default if v is None else compat_str(v)
2428
2429
2430 def str_to_int(int_str):
2431     """ A more relaxed version of int_or_none """
2432     if isinstance(int_str, int):
2433         return int_str
2434     elif isinstance(int_str, compat_str):
2435         int_str = re.sub(r'[,\.\+]', '', int_str)
2436         return int_or_none(int_str)
2437
2438
2439 def float_or_none(v, scale=1, invscale=1, default=None):
2440     if v is None:
2441         return default
2442     try:
2443         return float(v) * invscale / scale
2444     except (ValueError, TypeError):
2445         return default
2446
2447
2448 def bool_or_none(v, default=None):
2449     return v if isinstance(v, bool) else default
2450
2451
2452 def strip_or_none(v, default=None):
2453     return v.strip() if isinstance(v, compat_str) else default
2454
2455
2456 def url_or_none(url):
2457     if not url or not isinstance(url, compat_str):
2458         return None
2459     url = url.strip()
2460     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2461
2462
2463 def request_to_url(req):
2464     if isinstance(req, compat_urllib_request.Request):
2465         return req.get_full_url()
2466     else:
2467         return req
2468
2469
2470 def strftime_or_none(timestamp, date_format, default=None):
2471     datetime_object = None
2472     try:
2473         if isinstance(timestamp, (int, float)):  # unix timestamp
2474             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2475         elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
2476             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2477         return datetime_object.strftime(date_format)
2478     except (ValueError, TypeError, AttributeError):
2479         return default
2480
2481
2482 def parse_duration(s):
2483     if not isinstance(s, str):
2484         return None
2485     s = s.strip()
2486     if not s:
2487         return None
2488
2489     days, hours, mins, secs, ms = [None] * 5
2490     m = re.match(r'''(?x)
2491             (?P<before_secs>
2492                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2493             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2494             (?P<ms>[.:][0-9]+)?Z?$
2495         ''', s)
2496     if m:
2497         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2498     else:
2499         m = re.match(
2500             r'''(?ix)(?:P?
2501                 (?:
2502                     [0-9]+\s*y(?:ears?)?,?\s*
2503                 )?
2504                 (?:
2505                     [0-9]+\s*m(?:onths?)?,?\s*
2506                 )?
2507                 (?:
2508                     [0-9]+\s*w(?:eeks?)?,?\s*
2509                 )?
2510                 (?:
2511                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2512                 )?
2513                 T)?
2514                 (?:
2515                     (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2516                 )?
2517                 (?:
2518                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2519                 )?
2520                 (?:
2521                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2522                 )?Z?$''', s)
2523         if m:
2524             days, hours, mins, secs, ms = m.groups()
2525         else:
2526             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2527             if m:
2528                 hours, mins = m.groups()
2529             else:
2530                 return None
2531
2532     if ms:
2533         ms = ms.replace(':', '.')
2534     return sum(float(part or 0) * mult for part, mult in (
2535         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2536
2537
2538 def prepend_extension(filename, ext, expected_real_ext=None):
2539     name, real_ext = os.path.splitext(filename)
2540     return (
2541         f'{name}.{ext}{real_ext}'
2542         if not expected_real_ext or real_ext[1:] == expected_real_ext
2543         else f'{filename}.{ext}')
2544
2545
2546 def replace_extension(filename, ext, expected_real_ext=None):
2547     name, real_ext = os.path.splitext(filename)
2548     return '{}.{}'.format(
2549         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2550         ext)
2551
2552
2553 def check_executable(exe, args=[]):
2554     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2555     args can be a list of arguments for a short output (like -version) """
2556     try:
2557         Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2558     except OSError:
2559         return False
2560     return exe
2561
2562
2563 def _get_exe_version_output(exe, args, *, to_screen=None):
2564     if to_screen:
2565         to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2566     try:
2567         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2568         # SIGTTOU if yt-dlp is run in the background.
2569         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2570         out, _ = Popen(
2571             [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2572             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2573     except OSError:
2574         return False
2575     if isinstance(out, bytes):  # Python 2.x
2576         out = out.decode('ascii', 'ignore')
2577     return out
2578
2579
2580 def detect_exe_version(output, version_re=None, unrecognized='present'):
2581     assert isinstance(output, compat_str)
2582     if version_re is None:
2583         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2584     m = re.search(version_re, output)
2585     if m:
2586         return m.group(1)
2587     else:
2588         return unrecognized
2589
2590
2591 def get_exe_version(exe, args=['--version'],
2592                     version_re=None, unrecognized='present'):
2593     """ Returns the version of the specified executable,
2594     or False if the executable is not present """
2595     out = _get_exe_version_output(exe, args)
2596     return detect_exe_version(out, version_re, unrecognized) if out else False
2597
2598
2599 class LazyList(collections.abc.Sequence):
2600     """Lazy immutable list from an iterable
2601     Note that slices of a LazyList are lists and not LazyList"""
2602
2603     class IndexError(IndexError):
2604         pass
2605
2606     def __init__(self, iterable, *, reverse=False, _cache=None):
2607         self._iterable = iter(iterable)
2608         self._cache = [] if _cache is None else _cache
2609         self._reversed = reverse
2610
2611     def __iter__(self):
2612         if self._reversed:
2613             # We need to consume the entire iterable to iterate in reverse
2614             yield from self.exhaust()
2615             return
2616         yield from self._cache
2617         for item in self._iterable:
2618             self._cache.append(item)
2619             yield item
2620
2621     def _exhaust(self):
2622         self._cache.extend(self._iterable)
2623         self._iterable = []  # Discard the emptied iterable to make it pickle-able
2624         return self._cache
2625
2626     def exhaust(self):
2627         """Evaluate the entire iterable"""
2628         return self._exhaust()[::-1 if self._reversed else 1]
2629
2630     @staticmethod
2631     def _reverse_index(x):
2632         return None if x is None else -(x + 1)
2633
2634     def __getitem__(self, idx):
2635         if isinstance(idx, slice):
2636             if self._reversed:
2637                 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2638             start, stop, step = idx.start, idx.stop, idx.step or 1
2639         elif isinstance(idx, int):
2640             if self._reversed:
2641                 idx = self._reverse_index(idx)
2642             start, stop, step = idx, idx, 0
2643         else:
2644             raise TypeError('indices must be integers or slices')
2645         if ((start or 0) < 0 or (stop or 0) < 0
2646                 or (start is None and step < 0)
2647                 or (stop is None and step > 0)):
2648             # We need to consume the entire iterable to be able to slice from the end
2649             # Obviously, never use this with infinite iterables
2650             self._exhaust()
2651             try:
2652                 return self._cache[idx]
2653             except IndexError as e:
2654                 raise self.IndexError(e) from e
2655         n = max(start or 0, stop or 0) - len(self._cache) + 1
2656         if n > 0:
2657             self._cache.extend(itertools.islice(self._iterable, n))
2658         try:
2659             return self._cache[idx]
2660         except IndexError as e:
2661             raise self.IndexError(e) from e
2662
2663     def __bool__(self):
2664         try:
2665             self[-1] if self._reversed else self[0]
2666         except self.IndexError:
2667             return False
2668         return True
2669
2670     def __len__(self):
2671         self._exhaust()
2672         return len(self._cache)
2673
2674     def __reversed__(self):
2675         return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2676
2677     def __copy__(self):
2678         return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2679
2680     def __repr__(self):
2681         # repr and str should mimic a list. So we exhaust the iterable
2682         return repr(self.exhaust())
2683
2684     def __str__(self):
2685         return repr(self.exhaust())
2686
2687
2688 class PagedList:
2689
2690     class IndexError(IndexError):
2691         pass
2692
2693     def __len__(self):
2694         # This is only useful for tests
2695         return len(self.getslice())
2696
2697     def __init__(self, pagefunc, pagesize, use_cache=True):
2698         self._pagefunc = pagefunc
2699         self._pagesize = pagesize
2700         self._pagecount = float('inf')
2701         self._use_cache = use_cache
2702         self._cache = {}
2703
2704     def getpage(self, pagenum):
2705         page_results = self._cache.get(pagenum)
2706         if page_results is None:
2707             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2708         if self._use_cache:
2709             self._cache[pagenum] = page_results
2710         return page_results
2711
2712     def getslice(self, start=0, end=None):
2713         return list(self._getslice(start, end))
2714
2715     def _getslice(self, start, end):
2716         raise NotImplementedError('This method must be implemented by subclasses')
2717
2718     def __getitem__(self, idx):
2719         assert self._use_cache, 'Indexing PagedList requires cache'
2720         if not isinstance(idx, int) or idx < 0:
2721             raise TypeError('indices must be non-negative integers')
2722         entries = self.getslice(idx, idx + 1)
2723         if not entries:
2724             raise self.IndexError()
2725         return entries[0]
2726
2727
2728 class OnDemandPagedList(PagedList):
2729     """Download pages until a page with less than maximum results"""
2730
2731     def _getslice(self, start, end):
2732         for pagenum in itertools.count(start // self._pagesize):
2733             firstid = pagenum * self._pagesize
2734             nextfirstid = pagenum * self._pagesize + self._pagesize
2735             if start >= nextfirstid:
2736                 continue
2737
2738             startv = (
2739                 start % self._pagesize
2740                 if firstid <= start < nextfirstid
2741                 else 0)
2742             endv = (
2743                 ((end - 1) % self._pagesize) + 1
2744                 if (end is not None and firstid <= end <= nextfirstid)
2745                 else None)
2746
2747             try:
2748                 page_results = self.getpage(pagenum)
2749             except Exception:
2750                 self._pagecount = pagenum - 1
2751                 raise
2752             if startv != 0 or endv is not None:
2753                 page_results = page_results[startv:endv]
2754             yield from page_results
2755
2756             # A little optimization - if current page is not "full", ie. does
2757             # not contain page_size videos then we can assume that this page
2758             # is the last one - there are no more ids on further pages -
2759             # i.e. no need to query again.
2760             if len(page_results) + startv < self._pagesize:
2761                 break
2762
2763             # If we got the whole page, but the next page is not interesting,
2764             # break out early as well
2765             if end == nextfirstid:
2766                 break
2767
2768
2769 class InAdvancePagedList(PagedList):
2770     """PagedList with total number of pages known in advance"""
2771
2772     def __init__(self, pagefunc, pagecount, pagesize):
2773         PagedList.__init__(self, pagefunc, pagesize, True)
2774         self._pagecount = pagecount
2775
2776     def _getslice(self, start, end):
2777         start_page = start // self._pagesize
2778         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2779         skip_elems = start - start_page * self._pagesize
2780         only_more = None if end is None else end - start
2781         for pagenum in range(start_page, end_page):
2782             page_results = self.getpage(pagenum)
2783             if skip_elems:
2784                 page_results = page_results[skip_elems:]
2785                 skip_elems = None
2786             if only_more is not None:
2787                 if len(page_results) < only_more:
2788                     only_more -= len(page_results)
2789                 else:
2790                     yield from page_results[:only_more]
2791                     break
2792             yield from page_results
2793
2794
2795 def uppercase_escape(s):
2796     unicode_escape = codecs.getdecoder('unicode_escape')
2797     return re.sub(
2798         r'\\U[0-9a-fA-F]{8}',
2799         lambda m: unicode_escape(m.group(0))[0],
2800         s)
2801
2802
2803 def lowercase_escape(s):
2804     unicode_escape = codecs.getdecoder('unicode_escape')
2805     return re.sub(
2806         r'\\u[0-9a-fA-F]{4}',
2807         lambda m: unicode_escape(m.group(0))[0],
2808         s)
2809
2810
2811 def escape_rfc3986(s):
2812     """Escape non-ASCII characters as suggested by RFC 3986"""
2813     return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2814
2815
2816 def escape_url(url):
2817     """Escape URL as suggested by RFC 3986"""
2818     url_parsed = compat_urllib_parse_urlparse(url)
2819     return url_parsed._replace(
2820         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2821         path=escape_rfc3986(url_parsed.path),
2822         params=escape_rfc3986(url_parsed.params),
2823         query=escape_rfc3986(url_parsed.query),
2824         fragment=escape_rfc3986(url_parsed.fragment)
2825     ).geturl()
2826
2827
2828 def parse_qs(url):
2829     return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2830
2831
2832 def read_batch_urls(batch_fd):
2833     def fixup(url):
2834         if not isinstance(url, compat_str):
2835             url = url.decode('utf-8', 'replace')
2836         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2837         for bom in BOM_UTF8:
2838             if url.startswith(bom):
2839                 url = url[len(bom):]
2840         url = url.lstrip()
2841         if not url or url.startswith(('#', ';', ']')):
2842             return False
2843         # "#" cannot be stripped out since it is part of the URI
2844         # However, it can be safely stipped out if follwing a whitespace
2845         return re.split(r'\s#', url, 1)[0].rstrip()
2846
2847     with contextlib.closing(batch_fd) as fd:
2848         return [url for url in map(fixup, fd) if url]
2849
2850
2851 def urlencode_postdata(*args, **kargs):
2852     return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2853
2854
2855 def update_url_query(url, query):
2856     if not query:
2857         return url
2858     parsed_url = compat_urlparse.urlparse(url)
2859     qs = compat_parse_qs(parsed_url.query)
2860     qs.update(query)
2861     return compat_urlparse.urlunparse(parsed_url._replace(
2862         query=compat_urllib_parse_urlencode(qs, True)))
2863
2864
2865 def update_Request(req, url=None, data=None, headers={}, query={}):
2866     req_headers = req.headers.copy()
2867     req_headers.update(headers)
2868     req_data = data or req.data
2869     req_url = update_url_query(url or req.get_full_url(), query)
2870     req_get_method = req.get_method()
2871     if req_get_method == 'HEAD':
2872         req_type = HEADRequest
2873     elif req_get_method == 'PUT':
2874         req_type = PUTRequest
2875     else:
2876         req_type = compat_urllib_request.Request
2877     new_req = req_type(
2878         req_url, data=req_data, headers=req_headers,
2879         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2880     if hasattr(req, 'timeout'):
2881         new_req.timeout = req.timeout
2882     return new_req
2883
2884
2885 def _multipart_encode_impl(data, boundary):
2886     content_type = 'multipart/form-data; boundary=%s' % boundary
2887
2888     out = b''
2889     for k, v in data.items():
2890         out += b'--' + boundary.encode('ascii') + b'\r\n'
2891         if isinstance(k, compat_str):
2892             k = k.encode()
2893         if isinstance(v, compat_str):
2894             v = v.encode()
2895         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2896         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2897         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2898         if boundary.encode('ascii') in content:
2899             raise ValueError('Boundary overlaps with data')
2900         out += content
2901
2902     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2903
2904     return out, content_type
2905
2906
2907 def multipart_encode(data, boundary=None):
2908     '''
2909     Encode a dict to RFC 7578-compliant form-data
2910
2911     data:
2912         A dict where keys and values can be either Unicode or bytes-like
2913         objects.
2914     boundary:
2915         If specified a Unicode object, it's used as the boundary. Otherwise
2916         a random boundary is generated.
2917
2918     Reference: https://tools.ietf.org/html/rfc7578
2919     '''
2920     has_specified_boundary = boundary is not None
2921
2922     while True:
2923         if boundary is None:
2924             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2925
2926         try:
2927             out, content_type = _multipart_encode_impl(data, boundary)
2928             break
2929         except ValueError:
2930             if has_specified_boundary:
2931                 raise
2932             boundary = None
2933
2934     return out, content_type
2935
2936
2937 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2938     for val in map(d.get, variadic(key_or_keys)):
2939         if val is not None and (val or not skip_false_values):
2940             return val
2941     return default
2942
2943
2944 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2945     for f in funcs:
2946         try:
2947             val = f(*args, **kwargs)
2948         except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2949             pass
2950         else:
2951             if expected_type is None or isinstance(val, expected_type):
2952                 return val
2953
2954
2955 def try_get(src, getter, expected_type=None):
2956     return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2957
2958
2959 def filter_dict(dct, cndn=lambda _, v: v is not None):
2960     return {k: v for k, v in dct.items() if cndn(k, v)}
2961
2962
2963 def merge_dicts(*dicts):
2964     merged = {}
2965     for a_dict in dicts:
2966         for k, v in a_dict.items():
2967             if (v is not None and k not in merged
2968                     or isinstance(v, str) and merged[k] == ''):
2969                 merged[k] = v
2970     return merged
2971
2972
2973 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2974     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2975
2976
2977 US_RATINGS = {
2978     'G': 0,
2979     'PG': 10,
2980     'PG-13': 13,
2981     'R': 16,
2982     'NC': 18,
2983 }
2984
2985
2986 TV_PARENTAL_GUIDELINES = {
2987     'TV-Y': 0,
2988     'TV-Y7': 7,
2989     'TV-G': 0,
2990     'TV-PG': 0,
2991     'TV-14': 14,
2992     'TV-MA': 17,
2993 }
2994
2995
2996 def parse_age_limit(s):
2997     # isinstance(False, int) is True. So type() must be used instead
2998     if type(s) is int:  # noqa: E721
2999         return s if 0 <= s <= 21 else None
3000     elif not isinstance(s, str):
3001         return None
3002     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3003     if m:
3004         return int(m.group('age'))
3005     s = s.upper()
3006     if s in US_RATINGS:
3007         return US_RATINGS[s]
3008     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3009     if m:
3010         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3011     return None
3012
3013
3014 def strip_jsonp(code):
3015     return re.sub(
3016         r'''(?sx)^
3017             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3018             (?:\s*&&\s*(?P=func_name))?
3019             \s*\(\s*(?P<callback_data>.*)\);?
3020             \s*?(?://[^\n]*)*$''',
3021         r'\g<callback_data>', code)
3022
3023
3024 def js_to_json(code, vars={}):
3025     # vars is a dict of var, val pairs to substitute
3026     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3027     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3028     INTEGER_TABLE = (
3029         (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3030         (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3031     )
3032
3033     def fix_kv(m):
3034         v = m.group(0)
3035         if v in ('true', 'false', 'null'):
3036             return v
3037         elif v in ('undefined', 'void 0'):
3038             return 'null'
3039         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3040             return ""
3041
3042         if v[0] in ("'", '"'):
3043             v = re.sub(r'(?s)\\.|"', lambda m: {
3044                 '"': '\\"',
3045                 "\\'": "'",
3046                 '\\\n': '',
3047                 '\\x': '\\u00',
3048             }.get(m.group(0), m.group(0)), v[1:-1])
3049         else:
3050             for regex, base in INTEGER_TABLE:
3051                 im = re.match(regex, v)
3052                 if im:
3053                     i = int(im.group(1), base)
3054                     return '"%d":' % i if v.endswith(':') else '%d' % i
3055
3056             if v in vars:
3057                 return vars[v]
3058
3059         return '"%s"' % v
3060
3061     code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3062
3063     return re.sub(r'''(?sx)
3064         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3065         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3066         {comment}|,(?={skip}[\]}}])|
3067         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3068         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3069         [0-9]+(?={skip}:)|
3070         !+
3071         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3072
3073
3074 def qualities(quality_ids):
3075     """ Get a numeric quality value out of a list of possible values """
3076     def q(qid):
3077         try:
3078             return quality_ids.index(qid)
3079         except ValueError:
3080             return -1
3081     return q
3082
3083
3084 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3085
3086
3087 DEFAULT_OUTTMPL = {
3088     'default': '%(title)s [%(id)s].%(ext)s',
3089     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3090 }
3091 OUTTMPL_TYPES = {
3092     'chapter': None,
3093     'subtitle': None,
3094     'thumbnail': None,
3095     'description': 'description',
3096     'annotation': 'annotations.xml',
3097     'infojson': 'info.json',
3098     'link': None,
3099     'pl_video': None,
3100     'pl_thumbnail': None,
3101     'pl_description': 'description',
3102     'pl_infojson': 'info.json',
3103 }
3104
3105 # As of [1] format syntax is:
3106 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3107 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3108 STR_FORMAT_RE_TMPL = r'''(?x)
3109     (?<!%)(?P<prefix>(?:%%)*)
3110     %
3111     (?P<has_key>\((?P<key>{0})\))?
3112     (?P<format>
3113         (?P<conversion>[#0\-+ ]+)?
3114         (?P<min_width>\d+)?
3115         (?P<precision>\.\d+)?
3116         (?P<len_mod>[hlL])?  # unused in python
3117         {1}  # conversion type
3118     )
3119 '''
3120
3121
3122 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3123
3124
3125 def limit_length(s, length):
3126     """ Add ellipses to overly long strings """
3127     if s is None:
3128         return None
3129     ELLIPSES = '...'
3130     if len(s) > length:
3131         return s[:length - len(ELLIPSES)] + ELLIPSES
3132     return s
3133
3134
3135 def version_tuple(v):
3136     return tuple(int(e) for e in re.split(r'[-.]', v))
3137
3138
3139 def is_outdated_version(version, limit, assume_new=True):
3140     if not version:
3141         return not assume_new
3142     try:
3143         return version_tuple(version) < version_tuple(limit)
3144     except ValueError:
3145         return not assume_new
3146
3147
3148 def ytdl_is_updateable():
3149     """ Returns if yt-dlp can be updated with -U """
3150
3151     from .update import is_non_updateable
3152
3153     return not is_non_updateable()
3154
3155
3156 def args_to_str(args):
3157     # Get a short string representation for a subprocess command
3158     return ' '.join(compat_shlex_quote(a) for a in args)
3159
3160
3161 def error_to_compat_str(err):
3162     return str(err)
3163
3164
3165 def error_to_str(err):
3166     return f'{type(err).__name__}: {err}'
3167
3168
3169 def mimetype2ext(mt):
3170     if mt is None:
3171         return None
3172
3173     mt, _, params = mt.partition(';')
3174     mt = mt.strip()
3175
3176     FULL_MAP = {
3177         'audio/mp4': 'm4a',
3178         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3179         # it's the most popular one
3180         'audio/mpeg': 'mp3',
3181         'audio/x-wav': 'wav',
3182         'audio/wav': 'wav',
3183         'audio/wave': 'wav',
3184     }
3185
3186     ext = FULL_MAP.get(mt)
3187     if ext is not None:
3188         return ext
3189
3190     SUBTYPE_MAP = {
3191         '3gpp': '3gp',
3192         'smptett+xml': 'tt',
3193         'ttaf+xml': 'dfxp',
3194         'ttml+xml': 'ttml',
3195         'x-flv': 'flv',
3196         'x-mp4-fragmented': 'mp4',
3197         'x-ms-sami': 'sami',
3198         'x-ms-wmv': 'wmv',
3199         'mpegurl': 'm3u8',
3200         'x-mpegurl': 'm3u8',
3201         'vnd.apple.mpegurl': 'm3u8',
3202         'dash+xml': 'mpd',
3203         'f4m+xml': 'f4m',
3204         'hds+xml': 'f4m',
3205         'vnd.ms-sstr+xml': 'ism',
3206         'quicktime': 'mov',
3207         'mp2t': 'ts',
3208         'x-wav': 'wav',
3209         'filmstrip+json': 'fs',
3210         'svg+xml': 'svg',
3211     }
3212
3213     _, _, subtype = mt.rpartition('/')
3214     ext = SUBTYPE_MAP.get(subtype.lower())
3215     if ext is not None:
3216         return ext
3217
3218     SUFFIX_MAP = {
3219         'json': 'json',
3220         'xml': 'xml',
3221         'zip': 'zip',
3222         'gzip': 'gz',
3223     }
3224
3225     _, _, suffix = subtype.partition('+')
3226     ext = SUFFIX_MAP.get(suffix)
3227     if ext is not None:
3228         return ext
3229
3230     return subtype.replace('+', '.')
3231
3232
3233 def ext2mimetype(ext_or_url):
3234     if not ext_or_url:
3235         return None
3236     if '.' not in ext_or_url:
3237         ext_or_url = f'file.{ext_or_url}'
3238     return mimetypes.guess_type(ext_or_url)[0]
3239
3240
3241 def parse_codecs(codecs_str):
3242     # http://tools.ietf.org/html/rfc6381
3243     if not codecs_str:
3244         return {}
3245     split_codecs = list(filter(None, map(
3246         str.strip, codecs_str.strip().strip(',').split(','))))
3247     vcodec, acodec, scodec, hdr = None, None, None, None
3248     for full_codec in split_codecs:
3249         parts = full_codec.split('.')
3250         codec = parts[0].replace('0', '')
3251         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3252                      'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3253             if not vcodec:
3254                 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3255                 if codec in ('dvh1', 'dvhe'):
3256                     hdr = 'DV'
3257                 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3258                     hdr = 'HDR10'
3259                 elif full_codec.replace('0', '').startswith('vp9.2'):
3260                     hdr = 'HDR10'
3261         elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3262             if not acodec:
3263                 acodec = full_codec
3264         elif codec in ('stpp', 'wvtt',):
3265             if not scodec:
3266                 scodec = full_codec
3267         else:
3268             write_string(f'WARNING: Unknown codec {full_codec}\n')
3269     if vcodec or acodec or scodec:
3270         return {
3271             'vcodec': vcodec or 'none',
3272             'acodec': acodec or 'none',
3273             'dynamic_range': hdr,
3274             **({'scodec': scodec} if scodec is not None else {}),
3275         }
3276     elif len(split_codecs) == 2:
3277         return {
3278             'vcodec': split_codecs[0],
3279             'acodec': split_codecs[1],
3280         }
3281     return {}
3282
3283
3284 def urlhandle_detect_ext(url_handle):
3285     getheader = url_handle.headers.get
3286
3287     cd = getheader('Content-Disposition')
3288     if cd:
3289         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3290         if m:
3291             e = determine_ext(m.group('filename'), default_ext=None)
3292             if e:
3293                 return e
3294
3295     return mimetype2ext(getheader('Content-Type'))
3296
3297
3298 def encode_data_uri(data, mime_type):
3299     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3300
3301
3302 def age_restricted(content_limit, age_limit):
3303     """ Returns True iff the content should be blocked """
3304
3305     if age_limit is None:  # No limit set
3306         return False
3307     if content_limit is None:
3308         return False  # Content available for everyone
3309     return age_limit < content_limit
3310
3311
3312 def is_html(first_bytes):
3313     """ Detect whether a file contains HTML by examining its first bytes. """
3314
3315     BOMS = [
3316         (b'\xef\xbb\xbf', 'utf-8'),
3317         (b'\x00\x00\xfe\xff', 'utf-32-be'),
3318         (b'\xff\xfe\x00\x00', 'utf-32-le'),
3319         (b'\xff\xfe', 'utf-16-le'),
3320         (b'\xfe\xff', 'utf-16-be'),
3321     ]
3322
3323     encoding = 'utf-8'
3324     for bom, enc in BOMS:
3325         while first_bytes.startswith(bom):
3326             encoding, first_bytes = enc, first_bytes[len(bom):]
3327
3328     return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3329
3330
3331 def determine_protocol(info_dict):
3332     protocol = info_dict.get('protocol')
3333     if protocol is not None:
3334         return protocol
3335
3336     url = sanitize_url(info_dict['url'])
3337     if url.startswith('rtmp'):
3338         return 'rtmp'
3339     elif url.startswith('mms'):
3340         return 'mms'
3341     elif url.startswith('rtsp'):
3342         return 'rtsp'
3343
3344     ext = determine_ext(url)
3345     if ext == 'm3u8':
3346         return 'm3u8'
3347     elif ext == 'f4m':
3348         return 'f4m'
3349
3350     return compat_urllib_parse_urlparse(url).scheme
3351
3352
3353 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3354     """ Render a list of rows, each as a list of values.
3355     Text after a \t will be right aligned """
3356     def width(string):
3357         return len(remove_terminal_sequences(string).replace('\t', ''))
3358
3359     def get_max_lens(table):
3360         return [max(width(str(v)) for v in col) for col in zip(*table)]
3361
3362     def filter_using_list(row, filterArray):
3363         return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3364
3365     max_lens = get_max_lens(data) if hide_empty else []
3366     header_row = filter_using_list(header_row, max_lens)
3367     data = [filter_using_list(row, max_lens) for row in data]
3368
3369     table = [header_row] + data
3370     max_lens = get_max_lens(table)
3371     extra_gap += 1
3372     if delim:
3373         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3374         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3375     for row in table:
3376         for pos, text in enumerate(map(str, row)):
3377             if '\t' in text:
3378                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3379             else:
3380                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3381     ret = '\n'.join(''.join(row).rstrip() for row in table)
3382     return ret
3383
3384
3385 def _match_one(filter_part, dct, incomplete):
3386     # TODO: Generalize code with YoutubeDL._build_format_filter
3387     STRING_OPERATORS = {
3388         '*=': operator.contains,
3389         '^=': lambda attr, value: attr.startswith(value),
3390         '$=': lambda attr, value: attr.endswith(value),
3391         '~=': lambda attr, value: re.search(value, attr),
3392     }
3393     COMPARISON_OPERATORS = {
3394         **STRING_OPERATORS,
3395         '<=': operator.le,  # "<=" must be defined above "<"
3396         '<': operator.lt,
3397         '>=': operator.ge,
3398         '>': operator.gt,
3399         '=': operator.eq,
3400     }
3401
3402     if isinstance(incomplete, bool):
3403         is_incomplete = lambda _: incomplete
3404     else:
3405         is_incomplete = lambda k: k in incomplete
3406
3407     operator_rex = re.compile(r'''(?x)\s*
3408         (?P<key>[a-z_]+)
3409         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3410         (?:
3411             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3412             (?P<strval>.+?)
3413         )
3414         \s*$
3415         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3416     m = operator_rex.search(filter_part)
3417     if m:
3418         m = m.groupdict()
3419         unnegated_op = COMPARISON_OPERATORS[m['op']]
3420         if m['negation']:
3421             op = lambda attr, value: not unnegated_op(attr, value)
3422         else:
3423             op = unnegated_op
3424         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3425         if m['quote']:
3426             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3427         actual_value = dct.get(m['key'])
3428         numeric_comparison = None
3429         if isinstance(actual_value, (int, float)):
3430             # If the original field is a string and matching comparisonvalue is
3431             # a number we should respect the origin of the original field
3432             # and process comparison value as a string (see
3433             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3434             try:
3435                 numeric_comparison = int(comparison_value)
3436             except ValueError:
3437                 numeric_comparison = parse_filesize(comparison_value)
3438                 if numeric_comparison is None:
3439                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3440                 if numeric_comparison is None:
3441                     numeric_comparison = parse_duration(comparison_value)
3442         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3443             raise ValueError('Operator %s only supports string values!' % m['op'])
3444         if actual_value is None:
3445             return is_incomplete(m['key']) or m['none_inclusive']
3446         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3447
3448     UNARY_OPERATORS = {
3449         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3450         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3451     }
3452     operator_rex = re.compile(r'''(?x)\s*
3453         (?P<op>%s)\s*(?P<key>[a-z_]+)
3454         \s*$
3455         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3456     m = operator_rex.search(filter_part)
3457     if m:
3458         op = UNARY_OPERATORS[m.group('op')]
3459         actual_value = dct.get(m.group('key'))
3460         if is_incomplete(m.group('key')) and actual_value is None:
3461             return True
3462         return op(actual_value)
3463
3464     raise ValueError('Invalid filter part %r' % filter_part)
3465
3466
3467 def match_str(filter_str, dct, incomplete=False):
3468     """ Filter a dictionary with a simple string syntax.
3469     @returns           Whether the filter passes
3470     @param incomplete  Set of keys that is expected to be missing from dct.
3471                        Can be True/False to indicate all/none of the keys may be missing.
3472                        All conditions on incomplete keys pass if the key is missing
3473     """
3474     return all(
3475         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3476         for filter_part in re.split(r'(?<!\\)&', filter_str))
3477
3478
3479 def match_filter_func(filters):
3480     if not filters:
3481         return None
3482     filters = set(variadic(filters))
3483
3484     interactive = '-' in filters
3485     if interactive:
3486         filters.remove('-')
3487
3488     def _match_func(info_dict, incomplete=False):
3489         if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3490             return NO_DEFAULT if interactive and not incomplete else None
3491         else:
3492             video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3493             filter_str = ') | ('.join(map(str.strip, filters))
3494             return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3495     return _match_func
3496
3497
3498 def download_range_func(chapters, ranges):
3499     def inner(info_dict, ydl):
3500         warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3501                    else 'Chapter information is unavailable')
3502         for regex in chapters or []:
3503             for i, chapter in enumerate(info_dict.get('chapters') or []):
3504                 if re.search(regex, chapter['title']):
3505                     warning = None
3506                     yield {**chapter, 'index': i}
3507         if warning:
3508             ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3509
3510         yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3511
3512     return inner
3513
3514
3515 def parse_dfxp_time_expr(time_expr):
3516     if not time_expr:
3517         return
3518
3519     mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3520     if mobj:
3521         return float(mobj.group('time_offset'))
3522
3523     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3524     if mobj:
3525         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3526
3527
3528 def srt_subtitles_timecode(seconds):
3529     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3530
3531
3532 def ass_subtitles_timecode(seconds):
3533     time = timetuple_from_msec(seconds * 1000)
3534     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3535
3536
3537 def dfxp2srt(dfxp_data):
3538     '''
3539     @param dfxp_data A bytes-like object containing DFXP data
3540     @returns A unicode object containing converted SRT data
3541     '''
3542     LEGACY_NAMESPACES = (
3543         (b'http://www.w3.org/ns/ttml', [
3544             b'http://www.w3.org/2004/11/ttaf1',
3545             b'http://www.w3.org/2006/04/ttaf1',
3546             b'http://www.w3.org/2006/10/ttaf1',
3547         ]),
3548         (b'http://www.w3.org/ns/ttml#styling', [
3549             b'http://www.w3.org/ns/ttml#style',
3550         ]),
3551     )
3552
3553     SUPPORTED_STYLING = [
3554         'color',
3555         'fontFamily',
3556         'fontSize',
3557         'fontStyle',
3558         'fontWeight',
3559         'textDecoration'
3560     ]
3561
3562     _x = functools.partial(xpath_with_ns, ns_map={
3563         'xml': 'http://www.w3.org/XML/1998/namespace',
3564         'ttml': 'http://www.w3.org/ns/ttml',
3565         'tts': 'http://www.w3.org/ns/ttml#styling',
3566     })
3567
3568     styles = {}
3569     default_style = {}
3570
3571     class TTMLPElementParser:
3572         _out = ''
3573         _unclosed_elements = []
3574         _applied_styles = []
3575
3576         def start(self, tag, attrib):
3577             if tag in (_x('ttml:br'), 'br'):
3578                 self._out += '\n'
3579             else:
3580                 unclosed_elements = []
3581                 style = {}
3582                 element_style_id = attrib.get('style')
3583                 if default_style:
3584                     style.update(default_style)
3585                 if element_style_id:
3586                     style.update(styles.get(element_style_id, {}))
3587                 for prop in SUPPORTED_STYLING:
3588                     prop_val = attrib.get(_x('tts:' + prop))
3589                     if prop_val:
3590                         style[prop] = prop_val
3591                 if style:
3592                     font = ''
3593                     for k, v in sorted(style.items()):
3594                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3595                             continue
3596                         if k == 'color':
3597                             font += ' color="%s"' % v
3598                         elif k == 'fontSize':
3599                             font += ' size="%s"' % v
3600                         elif k == 'fontFamily':
3601                             font += ' face="%s"' % v
3602                         elif k == 'fontWeight' and v == 'bold':
3603                             self._out += '<b>'
3604                             unclosed_elements.append('b')
3605                         elif k == 'fontStyle' and v == 'italic':
3606                             self._out += '<i>'
3607                             unclosed_elements.append('i')
3608                         elif k == 'textDecoration' and v == 'underline':
3609                             self._out += '<u>'
3610                             unclosed_elements.append('u')
3611                     if font:
3612                         self._out += '<font' + font + '>'
3613                         unclosed_elements.append('font')
3614                     applied_style = {}
3615                     if self._applied_styles:
3616                         applied_style.update(self._applied_styles[-1])
3617                     applied_style.update(style)
3618                     self._applied_styles.append(applied_style)
3619                 self._unclosed_elements.append(unclosed_elements)
3620
3621         def end(self, tag):
3622             if tag not in (_x('ttml:br'), 'br'):
3623                 unclosed_elements = self._unclosed_elements.pop()
3624                 for element in reversed(unclosed_elements):
3625                     self._out += '</%s>' % element
3626                 if unclosed_elements and self._applied_styles:
3627                     self._applied_styles.pop()
3628
3629         def data(self, data):
3630             self._out += data
3631
3632         def close(self):
3633             return self._out.strip()
3634
3635     def parse_node(node):
3636         target = TTMLPElementParser()
3637         parser = xml.etree.ElementTree.XMLParser(target=target)
3638         parser.feed(xml.etree.ElementTree.tostring(node))
3639         return parser.close()
3640
3641     for k, v in LEGACY_NAMESPACES:
3642         for ns in v:
3643             dfxp_data = dfxp_data.replace(ns, k)
3644
3645     dfxp = compat_etree_fromstring(dfxp_data)
3646     out = []
3647     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3648
3649     if not paras:
3650         raise ValueError('Invalid dfxp/TTML subtitle')
3651
3652     repeat = False
3653     while True:
3654         for style in dfxp.findall(_x('.//ttml:style')):
3655             style_id = style.get('id') or style.get(_x('xml:id'))
3656             if not style_id:
3657                 continue
3658             parent_style_id = style.get('style')
3659             if parent_style_id:
3660                 if parent_style_id not in styles:
3661                     repeat = True
3662                     continue
3663                 styles[style_id] = styles[parent_style_id].copy()
3664             for prop in SUPPORTED_STYLING:
3665                 prop_val = style.get(_x('tts:' + prop))
3666                 if prop_val:
3667                     styles.setdefault(style_id, {})[prop] = prop_val
3668         if repeat:
3669             repeat = False
3670         else:
3671             break
3672
3673     for p in ('body', 'div'):
3674         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3675         if ele is None:
3676             continue
3677         style = styles.get(ele.get('style'))
3678         if not style:
3679             continue
3680         default_style.update(style)
3681
3682     for para, index in zip(paras, itertools.count(1)):
3683         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3684         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3685         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3686         if begin_time is None:
3687             continue
3688         if not end_time:
3689             if not dur:
3690                 continue
3691             end_time = begin_time + dur
3692         out.append('%d\n%s --> %s\n%s\n\n' % (
3693             index,
3694             srt_subtitles_timecode(begin_time),
3695             srt_subtitles_timecode(end_time),
3696             parse_node(para)))
3697
3698     return ''.join(out)
3699
3700
3701 def cli_option(params, command_option, param, separator=None):
3702     param = params.get(param)
3703     return ([] if param is None
3704             else [command_option, str(param)] if separator is None
3705             else [f'{command_option}{separator}{param}'])
3706
3707
3708 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3709     param = params.get(param)
3710     assert param in (True, False, None)
3711     return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3712
3713
3714 def cli_valueless_option(params, command_option, param, expected_value=True):
3715     return [command_option] if params.get(param) == expected_value else []
3716
3717
3718 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3719     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3720         if use_compat:
3721             return argdict
3722         else:
3723             argdict = None
3724     if argdict is None:
3725         return default
3726     assert isinstance(argdict, dict)
3727
3728     assert isinstance(keys, (list, tuple))
3729     for key_list in keys:
3730         arg_list = list(filter(
3731             lambda x: x is not None,
3732             [argdict.get(key.lower()) for key in variadic(key_list)]))
3733         if arg_list:
3734             return [arg for args in arg_list for arg in args]
3735     return default
3736
3737
3738 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3739     main_key, exe = main_key.lower(), exe.lower()
3740     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3741     keys = [f'{root_key}{k}' for k in (keys or [''])]
3742     if root_key in keys:
3743         if main_key != exe:
3744             keys.append((main_key, exe))
3745         keys.append('default')
3746     else:
3747         use_compat = False
3748     return cli_configuration_args(argdict, keys, default, use_compat)
3749
3750
3751 class ISO639Utils:
3752     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3753     _lang_map = {
3754         'aa': 'aar',
3755         'ab': 'abk',
3756         'ae': 'ave',
3757         'af': 'afr',
3758         'ak': 'aka',
3759         'am': 'amh',
3760         'an': 'arg',
3761         'ar': 'ara',
3762         'as': 'asm',
3763         'av': 'ava',
3764         'ay': 'aym',
3765         'az': 'aze',
3766         'ba': 'bak',
3767         'be': 'bel',
3768         'bg': 'bul',
3769         'bh': 'bih',
3770         'bi': 'bis',
3771         'bm': 'bam',
3772         'bn': 'ben',
3773         'bo': 'bod',
3774         'br': 'bre',
3775         'bs': 'bos',
3776         'ca': 'cat',
3777         'ce': 'che',
3778         'ch': 'cha',
3779         'co': 'cos',
3780         'cr': 'cre',
3781         'cs': 'ces',
3782         'cu': 'chu',
3783         'cv': 'chv',
3784         'cy': 'cym',
3785         'da': 'dan',
3786         'de': 'deu',
3787         'dv': 'div',
3788         'dz': 'dzo',
3789         'ee': 'ewe',
3790         'el': 'ell',
3791         'en': 'eng',
3792         'eo': 'epo',
3793         'es': 'spa',
3794         'et': 'est',
3795         'eu': 'eus',
3796         'fa': 'fas',
3797         'ff': 'ful',
3798         'fi': 'fin',
3799         'fj': 'fij',
3800         'fo': 'fao',
3801         'fr': 'fra',
3802         'fy': 'fry',
3803         'ga': 'gle',
3804         'gd': 'gla',
3805         'gl': 'glg',
3806         'gn': 'grn',
3807         'gu': 'guj',
3808         'gv': 'glv',
3809         'ha': 'hau',
3810         'he': 'heb',
3811         'iw': 'heb',  # Replaced by he in 1989 revision
3812         'hi': 'hin',
3813         'ho': 'hmo',
3814         'hr': 'hrv',
3815         'ht': 'hat',
3816         'hu': 'hun',
3817         'hy': 'hye',
3818         'hz': 'her',
3819         'ia': 'ina',
3820         'id': 'ind',
3821         'in': 'ind',  # Replaced by id in 1989 revision
3822         'ie': 'ile',
3823         'ig': 'ibo',
3824         'ii': 'iii',
3825         'ik': 'ipk',
3826         'io': 'ido',
3827         'is': 'isl',
3828         'it': 'ita',
3829         'iu': 'iku',
3830         'ja': 'jpn',
3831         'jv': 'jav',
3832         'ka': 'kat',
3833         'kg': 'kon',
3834         'ki': 'kik',
3835         'kj': 'kua',
3836         'kk': 'kaz',
3837         'kl': 'kal',
3838         'km': 'khm',
3839         'kn': 'kan',
3840         'ko': 'kor',
3841         'kr': 'kau',
3842         'ks': 'kas',
3843         'ku': 'kur',
3844         'kv': 'kom',
3845         'kw': 'cor',
3846         'ky': 'kir',
3847         'la': 'lat',
3848         'lb': 'ltz',
3849         'lg': 'lug',
3850         'li': 'lim',
3851         'ln': 'lin',
3852         'lo': 'lao',
3853         'lt': 'lit',
3854         'lu': 'lub',
3855         'lv': 'lav',
3856         'mg': 'mlg',
3857         'mh': 'mah',
3858         'mi': 'mri',
3859         'mk': 'mkd',
3860         'ml': 'mal',
3861         'mn': 'mon',
3862         'mr': 'mar',
3863         'ms': 'msa',
3864         'mt': 'mlt',
3865         'my': 'mya',
3866         'na': 'nau',
3867         'nb': 'nob',
3868         'nd': 'nde',
3869         'ne': 'nep',
3870         'ng': 'ndo',
3871         'nl': 'nld',
3872         'nn': 'nno',
3873         'no': 'nor',
3874         'nr': 'nbl',
3875         'nv': 'nav',
3876         'ny': 'nya',
3877         'oc': 'oci',
3878         'oj': 'oji',
3879         'om': 'orm',
3880         'or': 'ori',
3881         'os': 'oss',
3882         'pa': 'pan',
3883         'pi': 'pli',
3884         'pl': 'pol',
3885         'ps': 'pus',
3886         'pt': 'por',
3887         'qu': 'que',
3888         'rm': 'roh',
3889         'rn': 'run',
3890         'ro': 'ron',
3891         'ru': 'rus',
3892         'rw': 'kin',
3893         'sa': 'san',
3894         'sc': 'srd',
3895         'sd': 'snd',
3896         'se': 'sme',
3897         'sg': 'sag',
3898         'si': 'sin',
3899         'sk': 'slk',
3900         'sl': 'slv',
3901         'sm': 'smo',
3902         'sn': 'sna',
3903         'so': 'som',
3904         'sq': 'sqi',
3905         'sr': 'srp',
3906         'ss': 'ssw',
3907         'st': 'sot',
3908         'su': 'sun',
3909         'sv': 'swe',
3910         'sw': 'swa',
3911         'ta': 'tam',
3912         'te': 'tel',
3913         'tg': 'tgk',
3914         'th': 'tha',
3915         'ti': 'tir',
3916         'tk': 'tuk',
3917         'tl': 'tgl',
3918         'tn': 'tsn',
3919         'to': 'ton',
3920         'tr': 'tur',
3921         'ts': 'tso',
3922         'tt': 'tat',
3923         'tw': 'twi',
3924         'ty': 'tah',
3925         'ug': 'uig',
3926         'uk': 'ukr',
3927         'ur': 'urd',
3928         'uz': 'uzb',
3929         've': 'ven',
3930         'vi': 'vie',
3931         'vo': 'vol',
3932         'wa': 'wln',
3933         'wo': 'wol',
3934         'xh': 'xho',
3935         'yi': 'yid',
3936         'ji': 'yid',  # Replaced by yi in 1989 revision
3937         'yo': 'yor',
3938         'za': 'zha',
3939         'zh': 'zho',
3940         'zu': 'zul',
3941     }
3942
3943     @classmethod
3944     def short2long(cls, code):
3945         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3946         return cls._lang_map.get(code[:2])
3947
3948     @classmethod
3949     def long2short(cls, code):
3950         """Convert language code from ISO 639-2/T to ISO 639-1"""
3951         for short_name, long_name in cls._lang_map.items():
3952             if long_name == code:
3953                 return short_name
3954
3955
3956 class ISO3166Utils:
3957     # From http://data.okfn.org/data/core/country-list
3958     _country_map = {
3959         'AF': 'Afghanistan',
3960         'AX': 'Åland Islands',
3961         'AL': 'Albania',
3962         'DZ': 'Algeria',
3963         'AS': 'American Samoa',
3964         'AD': 'Andorra',
3965         'AO': 'Angola',
3966         'AI': 'Anguilla',
3967         'AQ': 'Antarctica',
3968         'AG': 'Antigua and Barbuda',
3969         'AR': 'Argentina',
3970         'AM': 'Armenia',
3971         'AW': 'Aruba',
3972         'AU': 'Australia',
3973         'AT': 'Austria',
3974         'AZ': 'Azerbaijan',
3975         'BS': 'Bahamas',
3976         'BH': 'Bahrain',
3977         'BD': 'Bangladesh',
3978         'BB': 'Barbados',
3979         'BY': 'Belarus',
3980         'BE': 'Belgium',
3981         'BZ': 'Belize',
3982         'BJ': 'Benin',
3983         'BM': 'Bermuda',
3984         'BT': 'Bhutan',
3985         'BO': 'Bolivia, Plurinational State of',
3986         'BQ': 'Bonaire, Sint Eustatius and Saba',
3987         'BA': 'Bosnia and Herzegovina',
3988         'BW': 'Botswana',
3989         'BV': 'Bouvet Island',
3990         'BR': 'Brazil',
3991         'IO': 'British Indian Ocean Territory',
3992         'BN': 'Brunei Darussalam',
3993         'BG': 'Bulgaria',
3994         'BF': 'Burkina Faso',
3995         'BI': 'Burundi',
3996         'KH': 'Cambodia',
3997         'CM': 'Cameroon',
3998         'CA': 'Canada',
3999         'CV': 'Cape Verde',
4000         'KY': 'Cayman Islands',
4001         'CF': 'Central African Republic',
4002         'TD': 'Chad',
4003         'CL': 'Chile',
4004         'CN': 'China',
4005         'CX': 'Christmas Island',
4006         'CC': 'Cocos (Keeling) Islands',
4007         'CO': 'Colombia',
4008         'KM': 'Comoros',
4009         'CG': 'Congo',
4010         'CD': 'Congo, the Democratic Republic of the',
4011         'CK': 'Cook Islands',
4012         'CR': 'Costa Rica',
4013         'CI': 'Côte d\'Ivoire',
4014         'HR': 'Croatia',
4015         'CU': 'Cuba',
4016         'CW': 'Curaçao',
4017         'CY': 'Cyprus',
4018         'CZ': 'Czech Republic',
4019         'DK': 'Denmark',
4020         'DJ': 'Djibouti',
4021         'DM': 'Dominica',
4022         'DO': 'Dominican Republic',
4023         'EC': 'Ecuador',
4024         'EG': 'Egypt',
4025         'SV': 'El Salvador',
4026         'GQ': 'Equatorial Guinea',
4027         'ER': 'Eritrea',
4028         'EE': 'Estonia',
4029         'ET': 'Ethiopia',
4030         'FK': 'Falkland Islands (Malvinas)',
4031         'FO': 'Faroe Islands',
4032         'FJ': 'Fiji',
4033         'FI': 'Finland',
4034         'FR': 'France',
4035         'GF': 'French Guiana',
4036         'PF': 'French Polynesia',
4037         'TF': 'French Southern Territories',
4038         'GA': 'Gabon',
4039         'GM': 'Gambia',
4040         'GE': 'Georgia',
4041         'DE': 'Germany',
4042         'GH': 'Ghana',
4043         'GI': 'Gibraltar',
4044         'GR': 'Greece',
4045         'GL': 'Greenland',
4046         'GD': 'Grenada',
4047         'GP': 'Guadeloupe',
4048         'GU': 'Guam',
4049         'GT': 'Guatemala',
4050         'GG': 'Guernsey',
4051         'GN': 'Guinea',
4052         'GW': 'Guinea-Bissau',
4053         'GY': 'Guyana',
4054         'HT': 'Haiti',
4055         'HM': 'Heard Island and McDonald Islands',
4056         'VA': 'Holy See (Vatican City State)',
4057         'HN': 'Honduras',
4058         'HK': 'Hong Kong',
4059         'HU': 'Hungary',
4060         'IS': 'Iceland',
4061         'IN': 'India',
4062         'ID': 'Indonesia',
4063         'IR': 'Iran, Islamic Republic of',
4064         'IQ': 'Iraq',
4065         'IE': 'Ireland',
4066         'IM': 'Isle of Man',
4067         'IL': 'Israel',
4068         'IT': 'Italy',
4069         'JM': 'Jamaica',
4070         'JP': 'Japan',
4071         'JE': 'Jersey',
4072         'JO': 'Jordan',
4073         'KZ': 'Kazakhstan',
4074         'KE': 'Kenya',
4075         'KI': 'Kiribati',
4076         'KP': 'Korea, Democratic People\'s Republic of',
4077         'KR': 'Korea, Republic of',
4078         'KW': 'Kuwait',
4079         'KG': 'Kyrgyzstan',
4080         'LA': 'Lao People\'s Democratic Republic',
4081         'LV': 'Latvia',
4082         'LB': 'Lebanon',
4083         'LS': 'Lesotho',
4084         'LR': 'Liberia',
4085         'LY': 'Libya',
4086         'LI': 'Liechtenstein',
4087         'LT': 'Lithuania',
4088         'LU': 'Luxembourg',
4089         'MO': 'Macao',
4090         'MK': 'Macedonia, the Former Yugoslav Republic of',
4091         'MG': 'Madagascar',
4092         'MW': 'Malawi',
4093         'MY': 'Malaysia',
4094         'MV': 'Maldives',
4095         'ML': 'Mali',
4096         'MT': 'Malta',
4097         'MH': 'Marshall Islands',
4098         'MQ': 'Martinique',
4099         'MR': 'Mauritania',
4100         'MU': 'Mauritius',
4101         'YT': 'Mayotte',
4102         'MX': 'Mexico',
4103         'FM': 'Micronesia, Federated States of',
4104         'MD': 'Moldova, Republic of',
4105         'MC': 'Monaco',
4106         'MN': 'Mongolia',
4107         'ME': 'Montenegro',
4108         'MS': 'Montserrat',
4109         'MA': 'Morocco',
4110         'MZ': 'Mozambique',
4111         'MM': 'Myanmar',
4112         'NA': 'Namibia',
4113         'NR': 'Nauru',
4114         'NP': 'Nepal',
4115         'NL': 'Netherlands',
4116         'NC': 'New Caledonia',
4117         'NZ': 'New Zealand',
4118         'NI': 'Nicaragua',
4119         'NE': 'Niger',
4120         'NG': 'Nigeria',
4121         'NU': 'Niue',
4122         'NF': 'Norfolk Island',
4123         'MP': 'Northern Mariana Islands',
4124         'NO': 'Norway',
4125         'OM': 'Oman',
4126         'PK': 'Pakistan',
4127         'PW': 'Palau',
4128         'PS': 'Palestine, State of',
4129         'PA': 'Panama',
4130         'PG': 'Papua New Guinea',
4131         'PY': 'Paraguay',
4132         'PE': 'Peru',
4133         'PH': 'Philippines',
4134         'PN': 'Pitcairn',
4135         'PL': 'Poland',
4136         'PT': 'Portugal',
4137         'PR': 'Puerto Rico',
4138         'QA': 'Qatar',
4139         'RE': 'Réunion',
4140         'RO': 'Romania',
4141         'RU': 'Russian Federation',
4142         'RW': 'Rwanda',
4143         'BL': 'Saint Barthélemy',
4144         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4145         'KN': 'Saint Kitts and Nevis',
4146         'LC': 'Saint Lucia',
4147         'MF': 'Saint Martin (French part)',
4148         'PM': 'Saint Pierre and Miquelon',
4149         'VC': 'Saint Vincent and the Grenadines',
4150         'WS': 'Samoa',
4151         'SM': 'San Marino',
4152         'ST': 'Sao Tome and Principe',
4153         'SA': 'Saudi Arabia',
4154         'SN': 'Senegal',
4155         'RS': 'Serbia',
4156         'SC': 'Seychelles',
4157         'SL': 'Sierra Leone',
4158         'SG': 'Singapore',
4159         'SX': 'Sint Maarten (Dutch part)',
4160         'SK': 'Slovakia',
4161         'SI': 'Slovenia',
4162         'SB': 'Solomon Islands',
4163         'SO': 'Somalia',
4164         'ZA': 'South Africa',
4165         'GS': 'South Georgia and the South Sandwich Islands',
4166         'SS': 'South Sudan',
4167         'ES': 'Spain',
4168         'LK': 'Sri Lanka',
4169         'SD': 'Sudan',
4170         'SR': 'Suriname',
4171         'SJ': 'Svalbard and Jan Mayen',
4172         'SZ': 'Swaziland',
4173         'SE': 'Sweden',
4174         'CH': 'Switzerland',
4175         'SY': 'Syrian Arab Republic',
4176         'TW': 'Taiwan, Province of China',
4177         'TJ': 'Tajikistan',
4178         'TZ': 'Tanzania, United Republic of',
4179         'TH': 'Thailand',
4180         'TL': 'Timor-Leste',
4181         'TG': 'Togo',
4182         'TK': 'Tokelau',
4183         'TO': 'Tonga',
4184         'TT': 'Trinidad and Tobago',
4185         'TN': 'Tunisia',
4186         'TR': 'Turkey',
4187         'TM': 'Turkmenistan',
4188         'TC': 'Turks and Caicos Islands',
4189         'TV': 'Tuvalu',
4190         'UG': 'Uganda',
4191         'UA': 'Ukraine',
4192         'AE': 'United Arab Emirates',
4193         'GB': 'United Kingdom',
4194         'US': 'United States',
4195         'UM': 'United States Minor Outlying Islands',
4196         'UY': 'Uruguay',
4197         'UZ': 'Uzbekistan',
4198         'VU': 'Vanuatu',
4199         'VE': 'Venezuela, Bolivarian Republic of',
4200         'VN': 'Viet Nam',
4201         'VG': 'Virgin Islands, British',
4202         'VI': 'Virgin Islands, U.S.',
4203         'WF': 'Wallis and Futuna',
4204         'EH': 'Western Sahara',
4205         'YE': 'Yemen',
4206         'ZM': 'Zambia',
4207         'ZW': 'Zimbabwe',
4208         # Not ISO 3166 codes, but used for IP blocks
4209         'AP': 'Asia/Pacific Region',
4210         'EU': 'Europe',
4211     }
4212
4213     @classmethod
4214     def short2full(cls, code):
4215         """Convert an ISO 3166-2 country code to the corresponding full name"""
4216         return cls._country_map.get(code.upper())
4217
4218
4219 class GeoUtils:
4220     # Major IPv4 address blocks per country
4221     _country_ip_map = {
4222         'AD': '46.172.224.0/19',
4223         'AE': '94.200.0.0/13',
4224         'AF': '149.54.0.0/17',
4225         'AG': '209.59.64.0/18',
4226         'AI': '204.14.248.0/21',
4227         'AL': '46.99.0.0/16',
4228         'AM': '46.70.0.0/15',
4229         'AO': '105.168.0.0/13',
4230         'AP': '182.50.184.0/21',
4231         'AQ': '23.154.160.0/24',
4232         'AR': '181.0.0.0/12',
4233         'AS': '202.70.112.0/20',
4234         'AT': '77.116.0.0/14',
4235         'AU': '1.128.0.0/11',
4236         'AW': '181.41.0.0/18',
4237         'AX': '185.217.4.0/22',
4238         'AZ': '5.197.0.0/16',
4239         'BA': '31.176.128.0/17',
4240         'BB': '65.48.128.0/17',
4241         'BD': '114.130.0.0/16',
4242         'BE': '57.0.0.0/8',
4243         'BF': '102.178.0.0/15',
4244         'BG': '95.42.0.0/15',
4245         'BH': '37.131.0.0/17',
4246         'BI': '154.117.192.0/18',
4247         'BJ': '137.255.0.0/16',
4248         'BL': '185.212.72.0/23',
4249         'BM': '196.12.64.0/18',
4250         'BN': '156.31.0.0/16',
4251         'BO': '161.56.0.0/16',
4252         'BQ': '161.0.80.0/20',
4253         'BR': '191.128.0.0/12',
4254         'BS': '24.51.64.0/18',
4255         'BT': '119.2.96.0/19',
4256         'BW': '168.167.0.0/16',
4257         'BY': '178.120.0.0/13',
4258         'BZ': '179.42.192.0/18',
4259         'CA': '99.224.0.0/11',
4260         'CD': '41.243.0.0/16',
4261         'CF': '197.242.176.0/21',
4262         'CG': '160.113.0.0/16',
4263         'CH': '85.0.0.0/13',
4264         'CI': '102.136.0.0/14',
4265         'CK': '202.65.32.0/19',
4266         'CL': '152.172.0.0/14',
4267         'CM': '102.244.0.0/14',
4268         'CN': '36.128.0.0/10',
4269         'CO': '181.240.0.0/12',
4270         'CR': '201.192.0.0/12',
4271         'CU': '152.206.0.0/15',
4272         'CV': '165.90.96.0/19',
4273         'CW': '190.88.128.0/17',
4274         'CY': '31.153.0.0/16',
4275         'CZ': '88.100.0.0/14',
4276         'DE': '53.0.0.0/8',
4277         'DJ': '197.241.0.0/17',
4278         'DK': '87.48.0.0/12',
4279         'DM': '192.243.48.0/20',
4280         'DO': '152.166.0.0/15',
4281         'DZ': '41.96.0.0/12',
4282         'EC': '186.68.0.0/15',
4283         'EE': '90.190.0.0/15',
4284         'EG': '156.160.0.0/11',
4285         'ER': '196.200.96.0/20',
4286         'ES': '88.0.0.0/11',
4287         'ET': '196.188.0.0/14',
4288         'EU': '2.16.0.0/13',
4289         'FI': '91.152.0.0/13',
4290         'FJ': '144.120.0.0/16',
4291         'FK': '80.73.208.0/21',
4292         'FM': '119.252.112.0/20',
4293         'FO': '88.85.32.0/19',
4294         'FR': '90.0.0.0/9',
4295         'GA': '41.158.0.0/15',
4296         'GB': '25.0.0.0/8',
4297         'GD': '74.122.88.0/21',
4298         'GE': '31.146.0.0/16',
4299         'GF': '161.22.64.0/18',
4300         'GG': '62.68.160.0/19',
4301         'GH': '154.160.0.0/12',
4302         'GI': '95.164.0.0/16',
4303         'GL': '88.83.0.0/19',
4304         'GM': '160.182.0.0/15',
4305         'GN': '197.149.192.0/18',
4306         'GP': '104.250.0.0/19',
4307         'GQ': '105.235.224.0/20',
4308         'GR': '94.64.0.0/13',
4309         'GT': '168.234.0.0/16',
4310         'GU': '168.123.0.0/16',
4311         'GW': '197.214.80.0/20',
4312         'GY': '181.41.64.0/18',
4313         'HK': '113.252.0.0/14',
4314         'HN': '181.210.0.0/16',
4315         'HR': '93.136.0.0/13',
4316         'HT': '148.102.128.0/17',
4317         'HU': '84.0.0.0/14',
4318         'ID': '39.192.0.0/10',
4319         'IE': '87.32.0.0/12',
4320         'IL': '79.176.0.0/13',
4321         'IM': '5.62.80.0/20',
4322         'IN': '117.192.0.0/10',
4323         'IO': '203.83.48.0/21',
4324         'IQ': '37.236.0.0/14',
4325         'IR': '2.176.0.0/12',
4326         'IS': '82.221.0.0/16',
4327         'IT': '79.0.0.0/10',
4328         'JE': '87.244.64.0/18',
4329         'JM': '72.27.0.0/17',
4330         'JO': '176.29.0.0/16',
4331         'JP': '133.0.0.0/8',
4332         'KE': '105.48.0.0/12',
4333         'KG': '158.181.128.0/17',
4334         'KH': '36.37.128.0/17',
4335         'KI': '103.25.140.0/22',
4336         'KM': '197.255.224.0/20',
4337         'KN': '198.167.192.0/19',
4338         'KP': '175.45.176.0/22',
4339         'KR': '175.192.0.0/10',
4340         'KW': '37.36.0.0/14',
4341         'KY': '64.96.0.0/15',
4342         'KZ': '2.72.0.0/13',
4343         'LA': '115.84.64.0/18',
4344         'LB': '178.135.0.0/16',
4345         'LC': '24.92.144.0/20',
4346         'LI': '82.117.0.0/19',
4347         'LK': '112.134.0.0/15',
4348         'LR': '102.183.0.0/16',
4349         'LS': '129.232.0.0/17',
4350         'LT': '78.56.0.0/13',
4351         'LU': '188.42.0.0/16',
4352         'LV': '46.109.0.0/16',
4353         'LY': '41.252.0.0/14',
4354         'MA': '105.128.0.0/11',
4355         'MC': '88.209.64.0/18',
4356         'MD': '37.246.0.0/16',
4357         'ME': '178.175.0.0/17',
4358         'MF': '74.112.232.0/21',
4359         'MG': '154.126.0.0/17',
4360         'MH': '117.103.88.0/21',
4361         'MK': '77.28.0.0/15',
4362         'ML': '154.118.128.0/18',
4363         'MM': '37.111.0.0/17',
4364         'MN': '49.0.128.0/17',
4365         'MO': '60.246.0.0/16',
4366         'MP': '202.88.64.0/20',
4367         'MQ': '109.203.224.0/19',
4368         'MR': '41.188.64.0/18',
4369         'MS': '208.90.112.0/22',
4370         'MT': '46.11.0.0/16',
4371         'MU': '105.16.0.0/12',
4372         'MV': '27.114.128.0/18',
4373         'MW': '102.70.0.0/15',
4374         'MX': '187.192.0.0/11',
4375         'MY': '175.136.0.0/13',
4376         'MZ': '197.218.0.0/15',
4377         'NA': '41.182.0.0/16',
4378         'NC': '101.101.0.0/18',
4379         'NE': '197.214.0.0/18',
4380         'NF': '203.17.240.0/22',
4381         'NG': '105.112.0.0/12',
4382         'NI': '186.76.0.0/15',
4383         'NL': '145.96.0.0/11',
4384         'NO': '84.208.0.0/13',
4385         'NP': '36.252.0.0/15',
4386         'NR': '203.98.224.0/19',
4387         'NU': '49.156.48.0/22',
4388         'NZ': '49.224.0.0/14',
4389         'OM': '5.36.0.0/15',
4390         'PA': '186.72.0.0/15',
4391         'PE': '186.160.0.0/14',
4392         'PF': '123.50.64.0/18',
4393         'PG': '124.240.192.0/19',
4394         'PH': '49.144.0.0/13',
4395         'PK': '39.32.0.0/11',
4396         'PL': '83.0.0.0/11',
4397         'PM': '70.36.0.0/20',
4398         'PR': '66.50.0.0/16',
4399         'PS': '188.161.0.0/16',
4400         'PT': '85.240.0.0/13',
4401         'PW': '202.124.224.0/20',
4402         'PY': '181.120.0.0/14',
4403         'QA': '37.210.0.0/15',
4404         'RE': '102.35.0.0/16',
4405         'RO': '79.112.0.0/13',
4406         'RS': '93.86.0.0/15',
4407         'RU': '5.136.0.0/13',
4408         'RW': '41.186.0.0/16',
4409         'SA': '188.48.0.0/13',
4410         'SB': '202.1.160.0/19',
4411         'SC': '154.192.0.0/11',
4412         'SD': '102.120.0.0/13',
4413         'SE': '78.64.0.0/12',
4414         'SG': '8.128.0.0/10',
4415         'SI': '188.196.0.0/14',
4416         'SK': '78.98.0.0/15',
4417         'SL': '102.143.0.0/17',
4418         'SM': '89.186.32.0/19',
4419         'SN': '41.82.0.0/15',
4420         'SO': '154.115.192.0/18',
4421         'SR': '186.179.128.0/17',
4422         'SS': '105.235.208.0/21',
4423         'ST': '197.159.160.0/19',
4424         'SV': '168.243.0.0/16',
4425         'SX': '190.102.0.0/20',
4426         'SY': '5.0.0.0/16',
4427         'SZ': '41.84.224.0/19',
4428         'TC': '65.255.48.0/20',
4429         'TD': '154.68.128.0/19',
4430         'TG': '196.168.0.0/14',
4431         'TH': '171.96.0.0/13',
4432         'TJ': '85.9.128.0/18',
4433         'TK': '27.96.24.0/21',
4434         'TL': '180.189.160.0/20',
4435         'TM': '95.85.96.0/19',
4436         'TN': '197.0.0.0/11',
4437         'TO': '175.176.144.0/21',
4438         'TR': '78.160.0.0/11',
4439         'TT': '186.44.0.0/15',
4440         'TV': '202.2.96.0/19',
4441         'TW': '120.96.0.0/11',
4442         'TZ': '156.156.0.0/14',
4443         'UA': '37.52.0.0/14',
4444         'UG': '102.80.0.0/13',
4445         'US': '6.0.0.0/8',
4446         'UY': '167.56.0.0/13',
4447         'UZ': '84.54.64.0/18',
4448         'VA': '212.77.0.0/19',
4449         'VC': '207.191.240.0/21',
4450         'VE': '186.88.0.0/13',
4451         'VG': '66.81.192.0/20',
4452         'VI': '146.226.0.0/16',
4453         'VN': '14.160.0.0/11',
4454         'VU': '202.80.32.0/20',
4455         'WF': '117.20.32.0/21',
4456         'WS': '202.4.32.0/19',
4457         'YE': '134.35.0.0/16',
4458         'YT': '41.242.116.0/22',
4459         'ZA': '41.0.0.0/11',
4460         'ZM': '102.144.0.0/13',
4461         'ZW': '102.177.192.0/18',
4462     }
4463
4464     @classmethod
4465     def random_ipv4(cls, code_or_block):
4466         if len(code_or_block) == 2:
4467             block = cls._country_ip_map.get(code_or_block.upper())
4468             if not block:
4469                 return None
4470         else:
4471             block = code_or_block
4472         addr, preflen = block.split('/')
4473         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4474         addr_max = addr_min | (0xffffffff >> int(preflen))
4475         return compat_str(socket.inet_ntoa(
4476             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4477
4478
4479 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4480     def __init__(self, proxies=None):
4481         # Set default handlers
4482         for type in ('http', 'https'):
4483             setattr(self, '%s_open' % type,
4484                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4485                         meth(r, proxy, type))
4486         compat_urllib_request.ProxyHandler.__init__(self, proxies)
4487
4488     def proxy_open(self, req, proxy, type):
4489         req_proxy = req.headers.get('Ytdl-request-proxy')
4490         if req_proxy is not None:
4491             proxy = req_proxy
4492             del req.headers['Ytdl-request-proxy']
4493
4494         if proxy == '__noproxy__':
4495             return None  # No Proxy
4496         if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4497             req.add_header('Ytdl-socks-proxy', proxy)
4498             # yt-dlp's http/https handlers do wrapping the socket with socks
4499             return None
4500         return compat_urllib_request.ProxyHandler.proxy_open(
4501             self, req, proxy, type)
4502
4503
4504 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4505 # released into Public Domain
4506 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4507
4508 def long_to_bytes(n, blocksize=0):
4509     """long_to_bytes(n:long, blocksize:int) : string
4510     Convert a long integer to a byte string.
4511
4512     If optional blocksize is given and greater than zero, pad the front of the
4513     byte string with binary zeros so that the length is a multiple of
4514     blocksize.
4515     """
4516     # after much testing, this algorithm was deemed to be the fastest
4517     s = b''
4518     n = int(n)
4519     while n > 0:
4520         s = compat_struct_pack('>I', n & 0xffffffff) + s
4521         n = n >> 32
4522     # strip off leading zeros
4523     for i in range(len(s)):
4524         if s[i] != b'\000'[0]:
4525             break
4526     else:
4527         # only happens when n == 0
4528         s = b'\000'
4529         i = 0
4530     s = s[i:]
4531     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4532     # de-padding being done above, but sigh...
4533     if blocksize > 0 and len(s) % blocksize:
4534         s = (blocksize - len(s) % blocksize) * b'\000' + s
4535     return s
4536
4537
4538 def bytes_to_long(s):
4539     """bytes_to_long(string) : long
4540     Convert a byte string to a long integer.
4541
4542     This is (essentially) the inverse of long_to_bytes().
4543     """
4544     acc = 0
4545     length = len(s)
4546     if length % 4:
4547         extra = (4 - length % 4)
4548         s = b'\000' * extra + s
4549         length = length + extra
4550     for i in range(0, length, 4):
4551         acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4552     return acc
4553
4554
4555 def ohdave_rsa_encrypt(data, exponent, modulus):
4556     '''
4557     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4558
4559     Input:
4560         data: data to encrypt, bytes-like object
4561         exponent, modulus: parameter e and N of RSA algorithm, both integer
4562     Output: hex string of encrypted data
4563
4564     Limitation: supports one block encryption only
4565     '''
4566
4567     payload = int(binascii.hexlify(data[::-1]), 16)
4568     encrypted = pow(payload, exponent, modulus)
4569     return '%x' % encrypted
4570
4571
4572 def pkcs1pad(data, length):
4573     """
4574     Padding input data with PKCS#1 scheme
4575
4576     @param {int[]} data        input data
4577     @param {int}   length      target length
4578     @returns {int[]}           padded data
4579     """
4580     if len(data) > length - 11:
4581         raise ValueError('Input data too long for PKCS#1 padding')
4582
4583     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4584     return [0, 2] + pseudo_random + [0] + data
4585
4586
4587 def encode_base_n(num, n, table=None):
4588     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4589     if not table:
4590         table = FULL_TABLE[:n]
4591
4592     if n > len(table):
4593         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4594
4595     if num == 0:
4596         return table[0]
4597
4598     ret = ''
4599     while num:
4600         ret = table[num % n] + ret
4601         num = num // n
4602     return ret
4603
4604
4605 def decode_packed_codes(code):
4606     mobj = re.search(PACKED_CODES_RE, code)
4607     obfuscated_code, base, count, symbols = mobj.groups()
4608     base = int(base)
4609     count = int(count)
4610     symbols = symbols.split('|')
4611     symbol_table = {}
4612
4613     while count:
4614         count -= 1
4615         base_n_count = encode_base_n(count, base)
4616         symbol_table[base_n_count] = symbols[count] or base_n_count
4617
4618     return re.sub(
4619         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4620         obfuscated_code)
4621
4622
4623 def caesar(s, alphabet, shift):
4624     if shift == 0:
4625         return s
4626     l = len(alphabet)
4627     return ''.join(
4628         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4629         for c in s)
4630
4631
4632 def rot47(s):
4633     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4634
4635
4636 def parse_m3u8_attributes(attrib):
4637     info = {}
4638     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4639         if val.startswith('"'):
4640             val = val[1:-1]
4641         info[key] = val
4642     return info
4643
4644
4645 def urshift(val, n):
4646     return val >> n if val >= 0 else (val + 0x100000000) >> n
4647
4648
4649 # Based on png2str() written by @gdkchan and improved by @yokrysty
4650 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4651 def decode_png(png_data):
4652     # Reference: https://www.w3.org/TR/PNG/
4653     header = png_data[8:]
4654
4655     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4656         raise OSError('Not a valid PNG file.')
4657
4658     int_map = {1: '>B', 2: '>H', 4: '>I'}
4659     unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4660
4661     chunks = []
4662
4663     while header:
4664         length = unpack_integer(header[:4])
4665         header = header[4:]
4666
4667         chunk_type = header[:4]
4668         header = header[4:]
4669
4670         chunk_data = header[:length]
4671         header = header[length:]
4672
4673         header = header[4:]  # Skip CRC
4674
4675         chunks.append({
4676             'type': chunk_type,
4677             'length': length,
4678             'data': chunk_data
4679         })
4680
4681     ihdr = chunks[0]['data']
4682
4683     width = unpack_integer(ihdr[:4])
4684     height = unpack_integer(ihdr[4:8])
4685
4686     idat = b''
4687
4688     for chunk in chunks:
4689         if chunk['type'] == b'IDAT':
4690             idat += chunk['data']
4691
4692     if not idat:
4693         raise OSError('Unable to read PNG data.')
4694
4695     decompressed_data = bytearray(zlib.decompress(idat))
4696
4697     stride = width * 3
4698     pixels = []
4699
4700     def _get_pixel(idx):
4701         x = idx % stride
4702         y = idx // stride
4703         return pixels[y][x]
4704
4705     for y in range(height):
4706         basePos = y * (1 + stride)
4707         filter_type = decompressed_data[basePos]
4708
4709         current_row = []
4710
4711         pixels.append(current_row)
4712
4713         for x in range(stride):
4714             color = decompressed_data[1 + basePos + x]
4715             basex = y * stride + x
4716             left = 0
4717             up = 0
4718
4719             if x > 2:
4720                 left = _get_pixel(basex - 3)
4721             if y > 0:
4722                 up = _get_pixel(basex - stride)
4723
4724             if filter_type == 1:  # Sub
4725                 color = (color + left) & 0xff
4726             elif filter_type == 2:  # Up
4727                 color = (color + up) & 0xff
4728             elif filter_type == 3:  # Average
4729                 color = (color + ((left + up) >> 1)) & 0xff
4730             elif filter_type == 4:  # Paeth
4731                 a = left
4732                 b = up
4733                 c = 0
4734
4735                 if x > 2 and y > 0:
4736                     c = _get_pixel(basex - stride - 3)
4737
4738                 p = a + b - c
4739
4740                 pa = abs(p - a)
4741                 pb = abs(p - b)
4742                 pc = abs(p - c)
4743
4744                 if pa <= pb and pa <= pc:
4745                     color = (color + a) & 0xff
4746                 elif pb <= pc:
4747                     color = (color + b) & 0xff
4748                 else:
4749                     color = (color + c) & 0xff
4750
4751             current_row.append(color)
4752
4753     return width, height, pixels
4754
4755
4756 def write_xattr(path, key, value):
4757     # Windows: Write xattrs to NTFS Alternate Data Streams:
4758     # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4759     if compat_os_name == 'nt':
4760         assert ':' not in key
4761         assert os.path.exists(path)
4762
4763         try:
4764             with open(f'{path}:{key}', 'wb') as f:
4765                 f.write(value)
4766         except OSError as e:
4767             raise XAttrMetadataError(e.errno, e.strerror)
4768         return
4769
4770     # UNIX Method 1. Use xattrs/pyxattrs modules
4771     from .dependencies import xattr
4772
4773     setxattr = None
4774     if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4775         # Unicode arguments are not supported in pyxattr until version 0.5.0
4776         # See https://github.com/ytdl-org/youtube-dl/issues/5498
4777         if version_tuple(xattr.__version__) >= (0, 5, 0):
4778             setxattr = xattr.set
4779     elif xattr:
4780         setxattr = xattr.setxattr
4781
4782     if setxattr:
4783         try:
4784             setxattr(path, key, value)
4785         except OSError as e:
4786             raise XAttrMetadataError(e.errno, e.strerror)
4787         return
4788
4789     # UNIX Method 2. Use setfattr/xattr executables
4790     exe = ('setfattr' if check_executable('setfattr', ['--version'])
4791            else 'xattr' if check_executable('xattr', ['-h']) else None)
4792     if not exe:
4793         raise XAttrUnavailableError(
4794             'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4795             + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4796
4797     value = value.decode()
4798     try:
4799         p = Popen(
4800             [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4801             stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4802     except OSError as e:
4803         raise XAttrMetadataError(e.errno, e.strerror)
4804     stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
4805     if p.returncode:
4806         raise XAttrMetadataError(p.returncode, stderr)
4807
4808
4809 def random_birthday(year_field, month_field, day_field):
4810     start_date = datetime.date(1950, 1, 1)
4811     end_date = datetime.date(1995, 12, 31)
4812     offset = random.randint(0, (end_date - start_date).days)
4813     random_date = start_date + datetime.timedelta(offset)
4814     return {
4815         year_field: str(random_date.year),
4816         month_field: str(random_date.month),
4817         day_field: str(random_date.day),
4818     }
4819
4820
4821 # Templates for internet shortcut files, which are plain text files.
4822 DOT_URL_LINK_TEMPLATE = '''\
4823 [InternetShortcut]
4824 URL=%(url)s
4825 '''
4826
4827 DOT_WEBLOC_LINK_TEMPLATE = '''\
4828 <?xml version="1.0" encoding="UTF-8"?>
4829 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4830 <plist version="1.0">
4831 <dict>
4832 \t<key>URL</key>
4833 \t<string>%(url)s</string>
4834 </dict>
4835 </plist>
4836 '''
4837
4838 DOT_DESKTOP_LINK_TEMPLATE = '''\
4839 [Desktop Entry]
4840 Encoding=UTF-8
4841 Name=%(filename)s
4842 Type=Link
4843 URL=%(url)s
4844 Icon=text-html
4845 '''
4846
4847 LINK_TEMPLATES = {
4848     'url': DOT_URL_LINK_TEMPLATE,
4849     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4850     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4851 }
4852
4853
4854 def iri_to_uri(iri):
4855     """
4856     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4857
4858     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4859     """
4860
4861     iri_parts = compat_urllib_parse_urlparse(iri)
4862
4863     if '[' in iri_parts.netloc:
4864         raise ValueError('IPv6 URIs are not, yet, supported.')
4865         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4866
4867     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4868
4869     net_location = ''
4870     if iri_parts.username:
4871         net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4872         if iri_parts.password is not None:
4873             net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4874         net_location += '@'
4875
4876     net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
4877     # The 'idna' encoding produces ASCII text.
4878     if iri_parts.port is not None and iri_parts.port != 80:
4879         net_location += ':' + str(iri_parts.port)
4880
4881     return urllib.parse.urlunparse(
4882         (iri_parts.scheme,
4883             net_location,
4884
4885             urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4886
4887             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4888             urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4889
4890             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4891             urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4892
4893             urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4894
4895     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4896
4897
4898 def to_high_limit_path(path):
4899     if sys.platform in ['win32', 'cygwin']:
4900         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4901         return '\\\\?\\' + os.path.abspath(path)
4902
4903     return path
4904
4905
4906 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4907     val = traverse_obj(obj, *variadic(field))
4908     if val in ignore:
4909         return default
4910     return template % (func(val) if func else val)
4911
4912
4913 def clean_podcast_url(url):
4914     return re.sub(r'''(?x)
4915         (?:
4916             (?:
4917                 chtbl\.com/track|
4918                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4919                 play\.podtrac\.com
4920             )/[^/]+|
4921             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4922             flex\.acast\.com|
4923             pd(?:
4924                 cn\.co| # https://podcorn.com/analytics-prefix/
4925                 st\.fm # https://podsights.com/docs/
4926             )/e
4927         )/''', '', url)
4928
4929
4930 _HEX_TABLE = '0123456789abcdef'
4931
4932
4933 def random_uuidv4():
4934     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4935
4936
4937 def make_dir(path, to_screen=None):
4938     try:
4939         dn = os.path.dirname(path)
4940         if dn and not os.path.exists(dn):
4941             os.makedirs(dn)
4942         return True
4943     except OSError as err:
4944         if callable(to_screen) is not None:
4945             to_screen('unable to create directory ' + error_to_compat_str(err))
4946         return False
4947
4948
4949 def get_executable_path():
4950     from .update import _get_variant_and_executable_path
4951
4952     return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
4953
4954
4955 def load_plugins(name, suffix, namespace):
4956     classes = {}
4957     with contextlib.suppress(FileNotFoundError):
4958         plugins_spec = importlib.util.spec_from_file_location(
4959             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4960         plugins = importlib.util.module_from_spec(plugins_spec)
4961         sys.modules[plugins_spec.name] = plugins
4962         plugins_spec.loader.exec_module(plugins)
4963         for name in dir(plugins):
4964             if name in namespace:
4965                 continue
4966             if not name.endswith(suffix):
4967                 continue
4968             klass = getattr(plugins, name)
4969             classes[name] = namespace[name] = klass
4970     return classes
4971
4972
4973 def traverse_obj(
4974         obj, *path_list, default=None, expected_type=None, get_all=True,
4975         casesense=True, is_user_input=False, traverse_string=False):
4976     ''' Traverse nested list/dict/tuple
4977     @param path_list        A list of paths which are checked one by one.
4978                             Each path is a list of keys where each key is a:
4979                               - None:     Do nothing
4980                               - string:   A dictionary key
4981                               - int:      An index into a list
4982                               - tuple:    A list of keys all of which will be traversed
4983                               - Ellipsis: Fetch all values in the object
4984                               - Function: Takes the key and value as arguments
4985                                           and returns whether the key matches or not
4986     @param default          Default value to return
4987     @param expected_type    Only accept final value of this type (Can also be any callable)
4988     @param get_all          Return all the values obtained from a path or only the first one
4989     @param casesense        Whether to consider dictionary keys as case sensitive
4990     @param is_user_input    Whether the keys are generated from user input. If True,
4991                             strings are converted to int/slice if necessary
4992     @param traverse_string  Whether to traverse inside strings. If True, any
4993                             non-compatible object will also be converted into a string
4994     # TODO: Write tests
4995     '''
4996     if not casesense:
4997         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4998         path_list = (map(_lower, variadic(path)) for path in path_list)
4999
5000     def _traverse_obj(obj, path, _current_depth=0):
5001         nonlocal depth
5002         path = tuple(variadic(path))
5003         for i, key in enumerate(path):
5004             if None in (key, obj):
5005                 return obj
5006             if isinstance(key, (list, tuple)):
5007                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5008                 key = ...
5009             if key is ...:
5010                 obj = (obj.values() if isinstance(obj, dict)
5011                        else obj if isinstance(obj, (list, tuple, LazyList))
5012                        else str(obj) if traverse_string else [])
5013                 _current_depth += 1
5014                 depth = max(depth, _current_depth)
5015                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5016             elif callable(key):
5017                 if isinstance(obj, (list, tuple, LazyList)):
5018                     obj = enumerate(obj)
5019                 elif isinstance(obj, dict):
5020                     obj = obj.items()
5021                 else:
5022                     if not traverse_string:
5023                         return None
5024                     obj = str(obj)
5025                 _current_depth += 1
5026                 depth = max(depth, _current_depth)
5027                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5028             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5029                 obj = (obj.get(key) if casesense or (key in obj)
5030                        else next((v for k, v in obj.items() if _lower(k) == key), None))
5031             else:
5032                 if is_user_input:
5033                     key = (int_or_none(key) if ':' not in key
5034                            else slice(*map(int_or_none, key.split(':'))))
5035                     if key == slice(None):
5036                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5037                 if not isinstance(key, (int, slice)):
5038                     return None
5039                 if not isinstance(obj, (list, tuple, LazyList)):
5040                     if not traverse_string:
5041                         return None
5042                     obj = str(obj)
5043                 try:
5044                     obj = obj[key]
5045                 except IndexError:
5046                     return None
5047         return obj
5048
5049     if isinstance(expected_type, type):
5050         type_test = lambda val: val if isinstance(val, expected_type) else None
5051     elif expected_type is not None:
5052         type_test = expected_type
5053     else:
5054         type_test = lambda val: val
5055
5056     for path in path_list:
5057         depth = 0
5058         val = _traverse_obj(obj, path)
5059         if val is not None:
5060             if depth:
5061                 for _ in range(depth - 1):
5062                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5063                 val = [v for v in map(type_test, val) if v is not None]
5064                 if val:
5065                     return val if get_all else val[0]
5066             else:
5067                 val = type_test(val)
5068                 if val is not None:
5069                     return val
5070     return default
5071
5072
5073 def traverse_dict(dictn, keys, casesense=True):
5074     write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5075                  'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5076     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5077
5078
5079 def get_first(obj, keys, **kwargs):
5080     return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5081
5082
5083 def variadic(x, allowed_types=(str, bytes, dict)):
5084     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5085
5086
5087 def decode_base(value, digits):
5088     # This will convert given base-x string to scalar (long or int)
5089     table = {char: index for index, char in enumerate(digits)}
5090     result = 0
5091     base = len(digits)
5092     for chr in value:
5093         result *= base
5094         result += table[chr]
5095     return result
5096
5097
5098 def time_seconds(**kwargs):
5099     t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5100     return t.timestamp()
5101
5102
5103 # create a JSON Web Signature (jws) with HS256 algorithm
5104 # the resulting format is in JWS Compact Serialization
5105 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5106 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5107 def jwt_encode_hs256(payload_data, key, headers={}):
5108     header_data = {
5109         'alg': 'HS256',
5110         'typ': 'JWT',
5111     }
5112     if headers:
5113         header_data.update(headers)
5114     header_b64 = base64.b64encode(json.dumps(header_data).encode())
5115     payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5116     h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5117     signature_b64 = base64.b64encode(h.digest())
5118     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5119     return token
5120
5121
5122 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5123 def jwt_decode_hs256(jwt):
5124     header_b64, payload_b64, signature_b64 = jwt.split('.')
5125     payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5126     return payload_data
5127
5128
5129 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5130
5131
5132 @functools.cache
5133 def supports_terminal_sequences(stream):
5134     if compat_os_name == 'nt':
5135         if not WINDOWS_VT_MODE:
5136             return False
5137     elif not os.getenv('TERM'):
5138         return False
5139     try:
5140         return stream.isatty()
5141     except BaseException:
5142         return False
5143
5144
5145 def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
5146     if get_windows_version() < (10, 0, 10586):
5147         return
5148     global WINDOWS_VT_MODE
5149     startupinfo = subprocess.STARTUPINFO()
5150     startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
5151     try:
5152         subprocess.Popen('', shell=True, startupinfo=startupinfo).wait()
5153     except Exception:
5154         return
5155
5156     WINDOWS_VT_MODE = True
5157     supports_terminal_sequences.cache_clear()
5158
5159
5160 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5161
5162
5163 def remove_terminal_sequences(string):
5164     return _terminal_sequences_re.sub('', string)
5165
5166
5167 def number_of_digits(number):
5168     return len('%d' % number)
5169
5170
5171 def join_nonempty(*values, delim='-', from_dict=None):
5172     if from_dict is not None:
5173         values = map(from_dict.get, values)
5174     return delim.join(map(str, filter(None, values)))
5175
5176
5177 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5178     """
5179     Find the largest format dimensions in terms of video width and, for each thumbnail:
5180     * Modify the URL: Match the width with the provided regex and replace with the former width
5181     * Update dimensions
5182
5183     This function is useful with video services that scale the provided thumbnails on demand
5184     """
5185     _keys = ('width', 'height')
5186     max_dimensions = max(
5187         (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5188         default=(0, 0))
5189     if not max_dimensions[0]:
5190         return thumbnails
5191     return [
5192         merge_dicts(
5193             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5194             dict(zip(_keys, max_dimensions)), thumbnail)
5195         for thumbnail in thumbnails
5196     ]
5197
5198
5199 def parse_http_range(range):
5200     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5201     if not range:
5202         return None, None, None
5203     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5204     if not crg:
5205         return None, None, None
5206     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5207
5208
5209 def read_stdin(what):
5210     eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5211     write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5212     return sys.stdin
5213
5214
5215 class Config:
5216     own_args = None
5217     parsed_args = None
5218     filename = None
5219     __initialized = False
5220
5221     def __init__(self, parser, label=None):
5222         self.parser, self.label = parser, label
5223         self._loaded_paths, self.configs = set(), []
5224
5225     def init(self, args=None, filename=None):
5226         assert not self.__initialized
5227         directory = ''
5228         if filename:
5229             location = os.path.realpath(filename)
5230             directory = os.path.dirname(location)
5231             if location in self._loaded_paths:
5232                 return False
5233             self._loaded_paths.add(location)
5234
5235         self.own_args, self.__initialized = args, True
5236         opts, _ = self.parser.parse_known_args(args)
5237         self.parsed_args, self.filename = args, filename
5238
5239         for location in opts.config_locations or []:
5240             if location == '-':
5241                 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5242                 continue
5243             location = os.path.join(directory, expand_path(location))
5244             if os.path.isdir(location):
5245                 location = os.path.join(location, 'yt-dlp.conf')
5246             if not os.path.exists(location):
5247                 self.parser.error(f'config location {location} does not exist')
5248             self.append_config(self.read_file(location), location)
5249         return True
5250
5251     def __str__(self):
5252         label = join_nonempty(
5253             self.label, 'config', f'"{self.filename}"' if self.filename else '',
5254             delim=' ')
5255         return join_nonempty(
5256             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5257             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5258             delim='\n')
5259
5260     @staticmethod
5261     def read_file(filename, default=[]):
5262         try:
5263             optionf = open(filename)
5264         except OSError:
5265             return default  # silently skip if file is not present
5266         try:
5267             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5268             contents = optionf.read()
5269             res = shlex.split(contents, comments=True)
5270         finally:
5271             optionf.close()
5272         return res
5273
5274     @staticmethod
5275     def hide_login_info(opts):
5276         PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5277         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5278
5279         def _scrub_eq(o):
5280             m = eqre.match(o)
5281             if m:
5282                 return m.group('key') + '=PRIVATE'
5283             else:
5284                 return o
5285
5286         opts = list(map(_scrub_eq, opts))
5287         for idx, opt in enumerate(opts):
5288             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5289                 opts[idx + 1] = 'PRIVATE'
5290         return opts
5291
5292     def append_config(self, *args, label=None):
5293         config = type(self)(self.parser, label)
5294         config._loaded_paths = self._loaded_paths
5295         if config.init(*args):
5296             self.configs.append(config)
5297
5298     @property
5299     def all_args(self):
5300         for config in reversed(self.configs):
5301             yield from config.all_args
5302         yield from self.parsed_args or []
5303
5304     def parse_known_args(self, **kwargs):
5305         return self.parser.parse_known_args(self.all_args, **kwargs)
5306
5307     def parse_args(self):
5308         return self.parser.parse_args(self.all_args)
5309
5310
5311 class WebSocketsWrapper():
5312     """Wraps websockets module to use in non-async scopes"""
5313     pool = None
5314
5315     def __init__(self, url, headers=None, connect=True):
5316         self.loop = asyncio.new_event_loop()
5317         # XXX: "loop" is deprecated
5318         self.conn = websockets.connect(
5319             url, extra_headers=headers, ping_interval=None,
5320             close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5321         if connect:
5322             self.__enter__()
5323         atexit.register(self.__exit__, None, None, None)
5324
5325     def __enter__(self):
5326         if not self.pool:
5327             self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5328         return self
5329
5330     def send(self, *args):
5331         self.run_with_loop(self.pool.send(*args), self.loop)
5332
5333     def recv(self, *args):
5334         return self.run_with_loop(self.pool.recv(*args), self.loop)
5335
5336     def __exit__(self, type, value, traceback):
5337         try:
5338             return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5339         finally:
5340             self.loop.close()
5341             self._cancel_all_tasks(self.loop)
5342
5343     # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5344     # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5345     @staticmethod
5346     def run_with_loop(main, loop):
5347         if not asyncio.iscoroutine(main):
5348             raise ValueError(f'a coroutine was expected, got {main!r}')
5349
5350         try:
5351             return loop.run_until_complete(main)
5352         finally:
5353             loop.run_until_complete(loop.shutdown_asyncgens())
5354             if hasattr(loop, 'shutdown_default_executor'):
5355                 loop.run_until_complete(loop.shutdown_default_executor())
5356
5357     @staticmethod
5358     def _cancel_all_tasks(loop):
5359         to_cancel = asyncio.all_tasks(loop)
5360
5361         if not to_cancel:
5362             return
5363
5364         for task in to_cancel:
5365             task.cancel()
5366
5367         # XXX: "loop" is removed in python 3.10+
5368         loop.run_until_complete(
5369             asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5370
5371         for task in to_cancel:
5372             if task.cancelled():
5373                 continue
5374             if task.exception() is not None:
5375                 loop.call_exception_handler({
5376                     'message': 'unhandled exception during asyncio.run() shutdown',
5377                     'exception': task.exception(),
5378                     'task': task,
5379                 })
5380
5381
5382 def merge_headers(*dicts):
5383     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5384     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5385
5386
5387 class classproperty:
5388     """classmethod(property(func)) that works in py < 3.9"""
5389
5390     def __init__(self, func):
5391         functools.update_wrapper(self, func)
5392         self.func = func
5393
5394     def __get__(self, _, cls):
5395         return self.func(cls)
5396
5397
5398 class Namespace:
5399     """Immutable namespace"""
5400
5401     def __init__(self, **kwargs):
5402         self._dict = kwargs
5403
5404     def __getattr__(self, attr):
5405         return self._dict[attr]
5406
5407     def __contains__(self, item):
5408         return item in self._dict.values()
5409
5410     def __iter__(self):
5411         return iter(self._dict.items())
5412
5413     def __repr__(self):
5414         return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
5415
5416
5417 # Deprecated
5418 has_certifi = bool(certifi)
5419 has_websockets = bool(websockets)