yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import asyncio
   7 import atexit
   8 import base64
   9 import binascii
  10 import calendar
  11 import codecs
  12 import collections
  13 import contextlib
  14 import ctypes
  15 import datetime
  16 import email.utils
  17 import email.header
  18 import errno
  19 import functools
  20 import gzip
  21 import hashlib
  22 import hmac
  23 import importlib.util
  24 import io
  25 import itertools
  26 import json
  27 import locale
  28 import math
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import socket
  35 import ssl
  36 import subprocess
  37 import sys
  38 import tempfile
  39 import time
  40 import traceback
  41 import xml.etree.ElementTree
  42 import zlib
  43 import mimetypes
  44
  45 from .compat import (
  46     compat_HTMLParseError,
  47     compat_HTMLParser,
  48     compat_HTTPError,
  49     compat_basestring,
  50     compat_brotli,
  51     compat_chr,
  52     compat_cookiejar,
  53     compat_ctypes_WINFUNCTYPE,
  54     compat_etree_fromstring,
  55     compat_expanduser,
  56     compat_html_entities,
  57     compat_html_entities_html5,
  58     compat_http_client,
  59     compat_integer_types,
  60     compat_numeric_types,
  61     compat_kwargs,
  62     compat_os_name,
  63     compat_parse_qs,
  64     compat_shlex_split,
  65     compat_shlex_quote,
  66     compat_str,
  67     compat_struct_pack,
  68     compat_struct_unpack,
  69     compat_urllib_error,
  70     compat_urllib_parse,
  71     compat_urllib_parse_urlencode,
  72     compat_urllib_parse_urlparse,
  73     compat_urllib_parse_urlunparse,
  74     compat_urllib_parse_quote,
  75     compat_urllib_parse_quote_plus,
  76     compat_urllib_parse_unquote_plus,
  77     compat_urllib_request,
  78     compat_urlparse,
  79     compat_websockets,
  80     compat_xpath,
  81 )
  82
  83 from .socks import (
  84     ProxyType,
  85     sockssocket,
  86 )
  87
  88 try:
  89     import certifi
  90     has_certifi = True
  91 except ImportError:
  92     has_certifi = False
  93
  94
  95 def register_socks_protocols():
  96     # "Register" SOCKS protocols
  97     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  98     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  99     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
 100         if scheme not in compat_urlparse.uses_netloc:
 101             compat_urlparse.uses_netloc.append(scheme)
 102
 103
 104 # This is not clearly defined otherwise
 105 compiled_regex_type = type(re.compile(''))
 106
 107
 108 def random_user_agent():
 109     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 110     _CHROME_VERSIONS = (
 111         '90.0.4430.212',
 112         '90.0.4430.24',
 113         '90.0.4430.70',
 114         '90.0.4430.72',
 115         '90.0.4430.85',
 116         '90.0.4430.93',
 117         '91.0.4472.101',
 118         '91.0.4472.106',
 119         '91.0.4472.114',
 120         '91.0.4472.124',
 121         '91.0.4472.164',
 122         '91.0.4472.19',
 123         '91.0.4472.77',
 124         '92.0.4515.107',
 125         '92.0.4515.115',
 126         '92.0.4515.131',
 127         '92.0.4515.159',
 128         '92.0.4515.43',
 129         '93.0.4556.0',
 130         '93.0.4577.15',
 131         '93.0.4577.63',
 132         '93.0.4577.82',
 133         '94.0.4606.41',
 134         '94.0.4606.54',
 135         '94.0.4606.61',
 136         '94.0.4606.71',
 137         '94.0.4606.81',
 138         '94.0.4606.85',
 139         '95.0.4638.17',
 140         '95.0.4638.50',
 141         '95.0.4638.54',
 142         '95.0.4638.69',
 143         '95.0.4638.74',
 144         '96.0.4664.18',
 145         '96.0.4664.45',
 146         '96.0.4664.55',
 147         '96.0.4664.93',
 148         '97.0.4692.20',
 149     )
 150     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 151
 152
 153 SUPPORTED_ENCODINGS = [
 154     'gzip', 'deflate'
 155 ]
 156 if compat_brotli:
 157     SUPPORTED_ENCODINGS.append('br')
 158
 159 std_headers = {
 160     'User-Agent': random_user_agent(),
 161     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 162     'Accept-Language': 'en-us,en;q=0.5',
 163     'Sec-Fetch-Mode': 'navigate',
 164 }
 165
 166
 167 USER_AGENTS = {
 168     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 169 }
 170
 171
 172 NO_DEFAULT = object()
 173
 174 ENGLISH_MONTH_NAMES = [
 175     'January', 'February', 'March', 'April', 'May', 'June',
 176     'July', 'August', 'September', 'October', 'November', 'December']
 177
 178 MONTH_NAMES = {
 179     'en': ENGLISH_MONTH_NAMES,
 180     'fr': [
 181         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 182         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 183 }
 184
 185 KNOWN_EXTENSIONS = (
 186     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 187     'flv', 'f4v', 'f4a', 'f4b',
 188     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 189     'mkv', 'mka', 'mk3d',
 190     'avi', 'divx',
 191     'mov',
 192     'asf', 'wmv', 'wma',
 193     '3gp', '3g2',
 194     'mp3',
 195     'flac',
 196     'ape',
 197     'wav',
 198     'f4f', 'f4m', 'm3u8', 'smil')
 199
 200 # needed for sanitizing filenames in restricted mode
 201 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 202                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 203                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 204
 205 DATE_FORMATS = (
 206     '%d %B %Y',
 207     '%d %b %Y',
 208     '%B %d %Y',
 209     '%B %dst %Y',
 210     '%B %dnd %Y',
 211     '%B %drd %Y',
 212     '%B %dth %Y',
 213     '%b %d %Y',
 214     '%b %dst %Y',
 215     '%b %dnd %Y',
 216     '%b %drd %Y',
 217     '%b %dth %Y',
 218     '%b %dst %Y %I:%M',
 219     '%b %dnd %Y %I:%M',
 220     '%b %drd %Y %I:%M',
 221     '%b %dth %Y %I:%M',
 222     '%Y %m %d',
 223     '%Y-%m-%d',
 224     '%Y.%m.%d.',
 225     '%Y/%m/%d',
 226     '%Y/%m/%d %H:%M',
 227     '%Y/%m/%d %H:%M:%S',
 228     '%Y%m%d%H%M',
 229     '%Y%m%d%H%M%S',
 230     '%Y%m%d',
 231     '%Y-%m-%d %H:%M',
 232     '%Y-%m-%d %H:%M:%S',
 233     '%Y-%m-%d %H:%M:%S.%f',
 234     '%Y-%m-%d %H:%M:%S:%f',
 235     '%d.%m.%Y %H:%M',
 236     '%d.%m.%Y %H.%M',
 237     '%Y-%m-%dT%H:%M:%SZ',
 238     '%Y-%m-%dT%H:%M:%S.%fZ',
 239     '%Y-%m-%dT%H:%M:%S.%f0Z',
 240     '%Y-%m-%dT%H:%M:%S',
 241     '%Y-%m-%dT%H:%M:%S.%f',
 242     '%Y-%m-%dT%H:%M',
 243     '%b %d %Y at %H:%M',
 244     '%b %d %Y at %H:%M:%S',
 245     '%B %d %Y at %H:%M',
 246     '%B %d %Y at %H:%M:%S',
 247     '%H:%M %d-%b-%Y',
 248 )
 249
 250 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 251 DATE_FORMATS_DAY_FIRST.extend([
 252     '%d-%m-%Y',
 253     '%d.%m.%Y',
 254     '%d.%m.%y',
 255     '%d/%m/%Y',
 256     '%d/%m/%y',
 257     '%d/%m/%Y %H:%M:%S',
 258 ])
 259
 260 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 261 DATE_FORMATS_MONTH_FIRST.extend([
 262     '%m-%d-%Y',
 263     '%m.%d.%Y',
 264     '%m/%d/%Y',
 265     '%m/%d/%y',
 266     '%m/%d/%Y %H:%M:%S',
 267 ])
 268
 269 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 270 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 271
 272
 273 def preferredencoding():
 274     """Get preferred encoding.
 275
 276     Returns the best encoding scheme for the system, based on
 277     locale.getpreferredencoding() and some further tweaks.
 278     """
 279     try:
 280         pref = locale.getpreferredencoding()
 281         'TEST'.encode(pref)
 282     except Exception:
 283         pref = 'UTF-8'
 284
 285     return pref
 286
 287
 288 def write_json_file(obj, fn):
 289     """ Encode obj as JSON and write it to fn, atomically if possible """
 290
 291     fn = encodeFilename(fn)
 292     if sys.version_info < (3, 0) and sys.platform != 'win32':
 293         encoding = get_filesystem_encoding()
 294         # os.path.basename returns a bytes object, but NamedTemporaryFile
 295         # will fail if the filename contains non ascii characters unless we
 296         # use a unicode object
 297         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 298         # the same for os.path.dirname
 299         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 300     else:
 301         path_basename = os.path.basename
 302         path_dirname = os.path.dirname
 303
 304     args = {
 305         'suffix': '.tmp',
 306         'prefix': path_basename(fn) + '.',
 307         'dir': path_dirname(fn),
 308         'delete': False,
 309     }
 310
 311     # In Python 2.x, json.dump expects a bytestream.
 312     # In Python 3.x, it writes to a character stream
 313     if sys.version_info < (3, 0):
 314         args['mode'] = 'wb'
 315     else:
 316         args.update({
 317             'mode': 'w',
 318             'encoding': 'utf-8',
 319         })
 320
 321     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 322
 323     try:
 324         with tf:
 325             json.dump(obj, tf, ensure_ascii=False)
 326         if sys.platform == 'win32':
 327             # Need to remove existing file on Windows, else os.rename raises
 328             # WindowsError or FileExistsError.
 329             try:
 330                 os.unlink(fn)
 331             except OSError:
 332                 pass
 333         try:
 334             mask = os.umask(0)
 335             os.umask(mask)
 336             os.chmod(tf.name, 0o666 & ~mask)
 337         except OSError:
 338             pass
 339         os.rename(tf.name, fn)
 340     except Exception:
 341         try:
 342             os.remove(tf.name)
 343         except OSError:
 344             pass
 345         raise
 346
 347
 348 if sys.version_info >= (2, 7):
 349     def find_xpath_attr(node, xpath, key, val=None):
 350         """ Find the xpath xpath[@key=val] """
 351         assert re.match(r'^[a-zA-Z_-]+$', key)
 352         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 353         return node.find(expr)
 354 else:
 355     def find_xpath_attr(node, xpath, key, val=None):
 356         for f in node.findall(compat_xpath(xpath)):
 357             if key not in f.attrib:
 358                 continue
 359             if val is None or f.attrib.get(key) == val:
 360                 return f
 361         return None
 362
 363 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 364 # the namespace parameter
 365
 366
 367 def xpath_with_ns(path, ns_map):
 368     components = [c.split(':') for c in path.split('/')]
 369     replaced = []
 370     for c in components:
 371         if len(c) == 1:
 372             replaced.append(c[0])
 373         else:
 374             ns, tag = c
 375             replaced.append('{%s}%s' % (ns_map[ns], tag))
 376     return '/'.join(replaced)
 377
 378
 379 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 380     def _find_xpath(xpath):
 381         return node.find(compat_xpath(xpath))
 382
 383     if isinstance(xpath, (str, compat_str)):
 384         n = _find_xpath(xpath)
 385     else:
 386         for xp in xpath:
 387             n = _find_xpath(xp)
 388             if n is not None:
 389                 break
 390
 391     if n is None:
 392         if default is not NO_DEFAULT:
 393             return default
 394         elif fatal:
 395             name = xpath if name is None else name
 396             raise ExtractorError('Could not find XML element %s' % name)
 397         else:
 398             return None
 399     return n
 400
 401
 402 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 403     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 404     if n is None or n == default:
 405         return n
 406     if n.text is None:
 407         if default is not NO_DEFAULT:
 408             return default
 409         elif fatal:
 410             name = xpath if name is None else name
 411             raise ExtractorError('Could not find XML element\'s text %s' % name)
 412         else:
 413             return None
 414     return n.text
 415
 416
 417 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 418     n = find_xpath_attr(node, xpath, key)
 419     if n is None:
 420         if default is not NO_DEFAULT:
 421             return default
 422         elif fatal:
 423             name = '%s[@%s]' % (xpath, key) if name is None else name
 424             raise ExtractorError('Could not find XML attribute %s' % name)
 425         else:
 426             return None
 427     return n.attrib[key]
 428
 429
 430 def get_element_by_id(id, html):
 431     """Return the content of the tag with the specified ID in the passed HTML document"""
 432     return get_element_by_attribute('id', id, html)
 433
 434
 435 def get_element_html_by_id(id, html):
 436     """Return the html of the tag with the specified ID in the passed HTML document"""
 437     return get_element_html_by_attribute('id', id, html)
 438
 439
 440 def get_element_by_class(class_name, html):
 441     """Return the content of the first tag with the specified class in the passed HTML document"""
 442     retval = get_elements_by_class(class_name, html)
 443     return retval[0] if retval else None
 444
 445
 446 def get_element_html_by_class(class_name, html):
 447     """Return the html of the first tag with the specified class in the passed HTML document"""
 448     retval = get_elements_html_by_class(class_name, html)
 449     return retval[0] if retval else None
 450
 451
 452 def get_element_by_attribute(attribute, value, html, escape_value=True):
 453     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 454     return retval[0] if retval else None
 455
 456
 457 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
 458     retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
 459     return retval[0] if retval else None
 460
 461
 462 def get_elements_by_class(class_name, html):
 463     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 464     return get_elements_by_attribute(
 465         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 466         html, escape_value=False)
 467
 468
 469 def get_elements_html_by_class(class_name, html):
 470     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 471     return get_elements_html_by_attribute(
 472         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 473         html, escape_value=False)
 474
 475
 476 def get_elements_by_attribute(*args, **kwargs):
 477     """Return the content of the tag with the specified attribute in the passed HTML document"""
 478     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 479
 480
 481 def get_elements_html_by_attribute(*args, **kwargs):
 482     """Return the html of the tag with the specified attribute in the passed HTML document"""
 483     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 484
 485
 486 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 487     """
 488     Return the text (content) and the html (whole) of the tag with the specified
 489     attribute in the passed HTML document
 490     """
 491
 492     value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 493
 494     value = re.escape(value) if escape_value else value
 495
 496     partial_element_re = r'''(?x)
 497         <(?P<tag>[a-zA-Z0-9:._-]+)
 498          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 499          \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
 500         ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
 501
 502     for m in re.finditer(partial_element_re, html):
 503         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 504
 505         yield (
 506             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 507             whole
 508         )
 509
 510
 511 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 512     """
 513     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 514     closing tag for the first opening tag it has encountered, and can be used
 515     as a context manager
 516     """
 517
 518     class HTMLBreakOnClosingTagException(Exception):
 519         pass
 520
 521     def __init__(self):
 522         self.tagstack = collections.deque()
 523         compat_HTMLParser.__init__(self)
 524
 525     def __enter__(self):
 526         return self
 527
 528     def __exit__(self, *_):
 529         self.close()
 530
 531     def close(self):
 532         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 533         # so data remains buffered; we no longer have any interest in it, thus
 534         # override this method to discard it
 535         pass
 536
 537     def handle_starttag(self, tag, _):
 538         self.tagstack.append(tag)
 539
 540     def handle_endtag(self, tag):
 541         if not self.tagstack:
 542             raise compat_HTMLParseError('no tags in the stack')
 543         while self.tagstack:
 544             inner_tag = self.tagstack.pop()
 545             if inner_tag == tag:
 546                 break
 547         else:
 548             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 549         if not self.tagstack:
 550             raise self.HTMLBreakOnClosingTagException()
 551
 552
 553 def get_element_text_and_html_by_tag(tag, html):
 554     """
 555     For the first element with the specified tag in the passed HTML document
 556     return its' content (text) and the whole element (html)
 557     """
 558     def find_or_raise(haystack, needle, exc):
 559         try:
 560             return haystack.index(needle)
 561         except ValueError:
 562             raise exc
 563     closing_tag = f'</{tag}>'
 564     whole_start = find_or_raise(
 565         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 566     content_start = find_or_raise(
 567         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 568     content_start += whole_start + 1
 569     with HTMLBreakOnClosingTagParser() as parser:
 570         parser.feed(html[whole_start:content_start])
 571         if not parser.tagstack or parser.tagstack[0] != tag:
 572             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 573         offset = content_start
 574         while offset < len(html):
 575             next_closing_tag_start = find_or_raise(
 576                 html[offset:], closing_tag,
 577                 compat_HTMLParseError(f'closing {tag} tag not found'))
 578             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 579             try:
 580                 parser.feed(html[offset:offset + next_closing_tag_end])
 581                 offset += next_closing_tag_end
 582             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 583                 return html[content_start:offset + next_closing_tag_start], \
 584                     html[whole_start:offset + next_closing_tag_end]
 585         raise compat_HTMLParseError('unexpected end of html')
 586
 587
 588 class HTMLAttributeParser(compat_HTMLParser):
 589     """Trivial HTML parser to gather the attributes for a single element"""
 590
 591     def __init__(self):
 592         self.attrs = {}
 593         compat_HTMLParser.__init__(self)
 594
 595     def handle_starttag(self, tag, attrs):
 596         self.attrs = dict(attrs)
 597
 598
 599 class HTMLListAttrsParser(compat_HTMLParser):
 600     """HTML parser to gather the attributes for the elements of a list"""
 601
 602     def __init__(self):
 603         compat_HTMLParser.__init__(self)
 604         self.items = []
 605         self._level = 0
 606
 607     def handle_starttag(self, tag, attrs):
 608         if tag == 'li' and self._level == 0:
 609             self.items.append(dict(attrs))
 610         self._level += 1
 611
 612     def handle_endtag(self, tag):
 613         self._level -= 1
 614
 615
 616 def extract_attributes(html_element):
 617     """Given a string for an HTML element such as
 618     <el
 619          a="foo" B="bar" c="&98;az" d=boz
 620          empty= noval entity="&amp;"
 621          sq='"' dq="'"
 622     >
 623     Decode and return a dictionary of attributes.
 624     {
 625         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 626         'empty': '', 'noval': None, 'entity': '&',
 627         'sq': '"', 'dq': '\''
 628     }.
 629     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 630     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 631     """
 632     parser = HTMLAttributeParser()
 633     try:
 634         parser.feed(html_element)
 635         parser.close()
 636     # Older Python may throw HTMLParseError in case of malformed HTML
 637     except compat_HTMLParseError:
 638         pass
 639     return parser.attrs
 640
 641
 642 def parse_list(webpage):
 643     """Given a string for an series of HTML <li> elements,
 644     return a dictionary of their attributes"""
 645     parser = HTMLListAttrsParser()
 646     parser.feed(webpage)
 647     parser.close()
 648     return parser.items
 649
 650
 651 def clean_html(html):
 652     """Clean an HTML snippet into a readable string"""
 653
 654     if html is None:  # Convenience for sanitizing descriptions etc.
 655         return html
 656
 657     html = re.sub(r'\s+', ' ', html)
 658     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 659     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 660     # Strip html tags
 661     html = re.sub('<.*?>', '', html)
 662     # Replace html entities
 663     html = unescapeHTML(html)
 664     return html.strip()
 665
 666
 667 def sanitize_open(filename, open_mode):
 668     """Try to open the given filename, and slightly tweak it if this fails.
 669
 670     Attempts to open the given filename. If this fails, it tries to change
 671     the filename slightly, step by step, until it's either able to open it
 672     or it fails and raises a final exception, like the standard open()
 673     function.
 674
 675     It returns the tuple (stream, definitive_file_name).
 676     """
 677     if filename == '-':
 678         if sys.platform == 'win32':
 679             import msvcrt
 680             msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 681         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 682
 683     for attempt in range(2):
 684         try:
 685             try:
 686                 if sys.platform == 'win32':
 687                     # FIXME: Windows only has mandatory locking which also locks the file from being read.
 688                     # So for now, don't lock the file on windows. Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 689                     raise LockingUnsupportedError()
 690                 stream = locked_file(filename, open_mode, block=False).__enter__()
 691             except LockingUnsupportedError:
 692                 stream = open(filename, open_mode)
 693             return (stream, filename)
 694         except (IOError, OSError) as err:
 695             if attempt or err.errno in (errno.EACCES,):
 696                 raise
 697             old_filename, filename = filename, sanitize_path(filename)
 698             if old_filename == filename:
 699                 raise
 700
 701
 702 def timeconvert(timestr):
 703     """Convert RFC 2822 defined time string into system timestamp"""
 704     timestamp = None
 705     timetuple = email.utils.parsedate_tz(timestr)
 706     if timetuple is not None:
 707         timestamp = email.utils.mktime_tz(timetuple)
 708     return timestamp
 709
 710
 711 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 712     """Sanitizes a string so it could be used as part of a filename.
 713     @param restricted   Use a stricter subset of allowed characters
 714     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 715                         If unset, yt-dlp's new sanitization rules are in effect
 716     """
 717     if s == '':
 718         return ''
 719
 720     def replace_insane(char):
 721         if restricted and char in ACCENT_CHARS:
 722             return ACCENT_CHARS[char]
 723         elif not restricted and char == '\n':
 724             return '\0 '
 725         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 726             return ''
 727         elif char == '"':
 728             return '' if restricted else '\''
 729         elif char == ':':
 730             return '\0_\0-' if restricted else '\0 \0-'
 731         elif char in '\\/|*<>':
 732             return '\0_'
 733         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 734             return '\0_'
 735         return char
 736
 737     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 738     result = ''.join(map(replace_insane, s))
 739     if is_id is NO_DEFAULT:
 740         result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
 741         STRIP_RE = '(?:\0.|[ _-])*'
 742         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 743     result = result.replace('\0', '') or '_'
 744
 745     if not is_id:
 746         while '__' in result:
 747             result = result.replace('__', '_')
 748         result = result.strip('_')
 749         # Common case of "Foreign band name - English song title"
 750         if restricted and result.startswith('-_'):
 751             result = result[2:]
 752         if result.startswith('-'):
 753             result = '_' + result[len('-'):]
 754         result = result.lstrip('.')
 755         if not result:
 756             result = '_'
 757     return result
 758
 759
 760 def sanitize_path(s, force=False):
 761     """Sanitizes and normalizes path on Windows"""
 762     if sys.platform == 'win32':
 763         force = False
 764         drive_or_unc, _ = os.path.splitdrive(s)
 765         if sys.version_info < (2, 7) and not drive_or_unc:
 766             drive_or_unc, _ = os.path.splitunc(s)
 767     elif force:
 768         drive_or_unc = ''
 769     else:
 770         return s
 771
 772     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 773     if drive_or_unc:
 774         norm_path.pop(0)
 775     sanitized_path = [
 776         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 777         for path_part in norm_path]
 778     if drive_or_unc:
 779         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 780     elif force and s[0] == os.path.sep:
 781         sanitized_path.insert(0, os.path.sep)
 782     return os.path.join(*sanitized_path)
 783
 784
 785 def sanitize_url(url):
 786     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 787     # the number of unwanted failures due to missing protocol
 788     if url.startswith('//'):
 789         return 'http:%s' % url
 790     # Fix some common typos seen so far
 791     COMMON_TYPOS = (
 792         # https://github.com/ytdl-org/youtube-dl/issues/15649
 793         (r'^httpss://', r'https://'),
 794         # https://bx1.be/lives/direct-tv/
 795         (r'^rmtp([es]?)://', r'rtmp\1://'),
 796     )
 797     for mistake, fixup in COMMON_TYPOS:
 798         if re.match(mistake, url):
 799             return re.sub(mistake, fixup, url)
 800     return url
 801
 802
 803 def extract_basic_auth(url):
 804     parts = compat_urlparse.urlsplit(url)
 805     if parts.username is None:
 806         return url, None
 807     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 808         parts.hostname if parts.port is None
 809         else '%s:%d' % (parts.hostname, parts.port))))
 810     auth_payload = base64.b64encode(
 811         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 812     return url, 'Basic ' + auth_payload.decode('utf-8')
 813
 814
 815 def sanitized_Request(url, *args, **kwargs):
 816     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 817     if auth_header is not None:
 818         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 819         headers['Authorization'] = auth_header
 820     return compat_urllib_request.Request(url, *args, **kwargs)
 821
 822
 823 def expand_path(s):
 824     """Expand shell variables and ~"""
 825     return os.path.expandvars(compat_expanduser(s))
 826
 827
 828 def orderedSet(iterable):
 829     """ Remove all duplicates from the input iterable """
 830     res = []
 831     for el in iterable:
 832         if el not in res:
 833             res.append(el)
 834     return res
 835
 836
 837 def _htmlentity_transform(entity_with_semicolon):
 838     """Transforms an HTML entity to a character."""
 839     entity = entity_with_semicolon[:-1]
 840
 841     # Known non-numeric HTML entity
 842     if entity in compat_html_entities.name2codepoint:
 843         return compat_chr(compat_html_entities.name2codepoint[entity])
 844
 845     # TODO: HTML5 allows entities without a semicolon. For example,
 846     # '&Eacuteric' should be decoded as 'Éric'.
 847     if entity_with_semicolon in compat_html_entities_html5:
 848         return compat_html_entities_html5[entity_with_semicolon]
 849
 850     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 851     if mobj is not None:
 852         numstr = mobj.group(1)
 853         if numstr.startswith('x'):
 854             base = 16
 855             numstr = '0%s' % numstr
 856         else:
 857             base = 10
 858         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 859         try:
 860             return compat_chr(int(numstr, base))
 861         except ValueError:
 862             pass
 863
 864     # Unknown entity in name, return its literal representation
 865     return '&%s;' % entity
 866
 867
 868 def unescapeHTML(s):
 869     if s is None:
 870         return None
 871     assert type(s) == compat_str
 872
 873     return re.sub(
 874         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 875
 876
 877 def escapeHTML(text):
 878     return (
 879         text
 880         .replace('&', '&amp;')
 881         .replace('<', '&lt;')
 882         .replace('>', '&gt;')
 883         .replace('"', '&quot;')
 884         .replace("'", '&#39;')
 885     )
 886
 887
 888 def process_communicate_or_kill(p, *args, **kwargs):
 889     try:
 890         return p.communicate(*args, **kwargs)
 891     except BaseException:  # Including KeyboardInterrupt
 892         p.kill()
 893         p.wait()
 894         raise
 895
 896
 897 class Popen(subprocess.Popen):
 898     if sys.platform == 'win32':
 899         _startupinfo = subprocess.STARTUPINFO()
 900         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 901     else:
 902         _startupinfo = None
 903
 904     def __init__(self, *args, **kwargs):
 905         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 906
 907     def communicate_or_kill(self, *args, **kwargs):
 908         return process_communicate_or_kill(self, *args, **kwargs)
 909
 910
 911 def get_subprocess_encoding():
 912     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 913         # For subprocess calls, encode with locale encoding
 914         # Refer to http://stackoverflow.com/a/9951851/35070
 915         encoding = preferredencoding()
 916     else:
 917         encoding = sys.getfilesystemencoding()
 918     if encoding is None:
 919         encoding = 'utf-8'
 920     return encoding
 921
 922
 923 def encodeFilename(s, for_subprocess=False):
 924     """
 925     @param s The name of the file
 926     """
 927
 928     assert type(s) == compat_str
 929
 930     # Python 3 has a Unicode API
 931     if sys.version_info >= (3, 0):
 932         return s
 933
 934     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 935     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 936     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 937     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 938         return s
 939
 940     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 941     if sys.platform.startswith('java'):
 942         return s
 943
 944     return s.encode(get_subprocess_encoding(), 'ignore')
 945
 946
 947 def decodeFilename(b, for_subprocess=False):
 948
 949     if sys.version_info >= (3, 0):
 950         return b
 951
 952     if not isinstance(b, bytes):
 953         return b
 954
 955     return b.decode(get_subprocess_encoding(), 'ignore')
 956
 957
 958 def encodeArgument(s):
 959     if not isinstance(s, compat_str):
 960         # Legacy code that uses byte strings
 961         # Uncomment the following line after fixing all post processors
 962         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 963         s = s.decode('ascii')
 964     return encodeFilename(s, True)
 965
 966
 967 def decodeArgument(b):
 968     return decodeFilename(b, True)
 969
 970
 971 def decodeOption(optval):
 972     if optval is None:
 973         return optval
 974     if isinstance(optval, bytes):
 975         optval = optval.decode(preferredencoding())
 976
 977     assert isinstance(optval, compat_str)
 978     return optval
 979
 980
 981 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 982
 983
 984 def timetuple_from_msec(msec):
 985     secs, msec = divmod(msec, 1000)
 986     mins, secs = divmod(secs, 60)
 987     hrs, mins = divmod(mins, 60)
 988     return _timetuple(hrs, mins, secs, msec)
 989
 990
 991 def formatSeconds(secs, delim=':', msec=False):
 992     time = timetuple_from_msec(secs * 1000)
 993     if time.hours:
 994         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 995     elif time.minutes:
 996         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 997     else:
 998         ret = '%d' % time.seconds
 999     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
1000
1001
1002 def _ssl_load_windows_store_certs(ssl_context, storename):
1003     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
1004     try:
1005         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
1006                  if encoding == 'x509_asn' and (
1007                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
1008     except PermissionError:
1009         return
1010     for cert in certs:
1011         try:
1012             ssl_context.load_verify_locations(cadata=cert)
1013         except ssl.SSLError:
1014             pass
1015
1016
1017 def make_HTTPS_handler(params, **kwargs):
1018     opts_check_certificate = not params.get('nocheckcertificate')
1019     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1020     context.check_hostname = opts_check_certificate
1021     if params.get('legacyserverconnect'):
1022         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
1023     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1024     if opts_check_certificate:
1025         if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
1026             context.load_verify_locations(cafile=certifi.where())
1027         else:
1028             try:
1029                 context.load_default_certs()
1030                 # Work around the issue in load_default_certs when there are bad certificates. See:
1031                 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1032                 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1033             except ssl.SSLError:
1034                 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1035                 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1036                     # Create a new context to discard any certificates that were already loaded
1037                     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1038                     context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
1039                     for storename in ('CA', 'ROOT'):
1040                         _ssl_load_windows_store_certs(context, storename)
1041                 context.set_default_verify_paths()
1042     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1043
1044
1045 def bug_reports_message(before=';'):
1046     msg = ('please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , '
1047            'filling out the appropriate issue template. '
1048            'Confirm you are on the latest version using  yt-dlp -U')
1049
1050     before = before.rstrip()
1051     if not before or before.endswith(('.', '!', '?')):
1052         msg = msg[0].title() + msg[1:]
1053
1054     return (before + ' ' if before else '') + msg
1055
1056
1057 class YoutubeDLError(Exception):
1058     """Base exception for YoutubeDL errors."""
1059     msg = None
1060
1061     def __init__(self, msg=None):
1062         if msg is not None:
1063             self.msg = msg
1064         elif self.msg is None:
1065             self.msg = type(self).__name__
1066         super().__init__(self.msg)
1067
1068
1069 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
1070 if hasattr(ssl, 'CertificateError'):
1071     network_exceptions.append(ssl.CertificateError)
1072 network_exceptions = tuple(network_exceptions)
1073
1074
1075 class ExtractorError(YoutubeDLError):
1076     """Error during info extraction."""
1077
1078     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1079         """ tb, if given, is the original traceback (so that it can be printed out).
1080         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1081         """
1082         if sys.exc_info()[0] in network_exceptions:
1083             expected = True
1084
1085         self.orig_msg = str(msg)
1086         self.traceback = tb
1087         self.expected = expected
1088         self.cause = cause
1089         self.video_id = video_id
1090         self.ie = ie
1091         self.exc_info = sys.exc_info()  # preserve original exception
1092
1093         super(ExtractorError, self).__init__(''.join((
1094             format_field(ie, template='[%s] '),
1095             format_field(video_id, template='%s: '),
1096             msg,
1097             format_field(cause, template=' (caused by %r)'),
1098             '' if expected else bug_reports_message())))
1099
1100     def format_traceback(self):
1101         return join_nonempty(
1102             self.traceback and ''.join(traceback.format_tb(self.traceback)),
1103             self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1104             delim='\n') or None
1105
1106
1107 class UnsupportedError(ExtractorError):
1108     def __init__(self, url):
1109         super(UnsupportedError, self).__init__(
1110             'Unsupported URL: %s' % url, expected=True)
1111         self.url = url
1112
1113
1114 class RegexNotFoundError(ExtractorError):
1115     """Error when a regex didn't match"""
1116     pass
1117
1118
1119 class GeoRestrictedError(ExtractorError):
1120     """Geographic restriction Error exception.
1121
1122     This exception may be thrown when a video is not available from your
1123     geographic location due to geographic restrictions imposed by a website.
1124     """
1125
1126     def __init__(self, msg, countries=None, **kwargs):
1127         kwargs['expected'] = True
1128         super(GeoRestrictedError, self).__init__(msg, **kwargs)
1129         self.countries = countries
1130
1131
1132 class DownloadError(YoutubeDLError):
1133     """Download Error exception.
1134
1135     This exception may be thrown by FileDownloader objects if they are not
1136     configured to continue on errors. They will contain the appropriate
1137     error message.
1138     """
1139
1140     def __init__(self, msg, exc_info=None):
1141         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1142         super(DownloadError, self).__init__(msg)
1143         self.exc_info = exc_info
1144
1145
1146 class EntryNotInPlaylist(YoutubeDLError):
1147     """Entry not in playlist exception.
1148
1149     This exception will be thrown by YoutubeDL when a requested entry
1150     is not found in the playlist info_dict
1151     """
1152     msg = 'Entry not found in info'
1153
1154
1155 class SameFileError(YoutubeDLError):
1156     """Same File exception.
1157
1158     This exception will be thrown by FileDownloader objects if they detect
1159     multiple files would have to be downloaded to the same file on disk.
1160     """
1161     msg = 'Fixed output name but more than one file to download'
1162
1163     def __init__(self, filename=None):
1164         if filename is not None:
1165             self.msg += f': {filename}'
1166         super().__init__(self.msg)
1167
1168
1169 class PostProcessingError(YoutubeDLError):
1170     """Post Processing exception.
1171
1172     This exception may be raised by PostProcessor's .run() method to
1173     indicate an error in the postprocessing task.
1174     """
1175
1176
1177 class DownloadCancelled(YoutubeDLError):
1178     """ Exception raised when the download queue should be interrupted """
1179     msg = 'The download was cancelled'
1180
1181
1182 class ExistingVideoReached(DownloadCancelled):
1183     """ --break-on-existing triggered """
1184     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1185
1186
1187 class RejectedVideoReached(DownloadCancelled):
1188     """ --break-on-reject triggered """
1189     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1190
1191
1192 class MaxDownloadsReached(DownloadCancelled):
1193     """ --max-downloads limit has been reached. """
1194     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1195
1196
1197 class ReExtractInfo(YoutubeDLError):
1198     """ Video info needs to be re-extracted. """
1199
1200     def __init__(self, msg, expected=False):
1201         super().__init__(msg)
1202         self.expected = expected
1203
1204
1205 class ThrottledDownload(ReExtractInfo):
1206     """ Download speed below --throttled-rate. """
1207     msg = 'The download speed is below throttle limit'
1208
1209     def __init__(self):
1210         super().__init__(self.msg, expected=False)
1211
1212
1213 class UnavailableVideoError(YoutubeDLError):
1214     """Unavailable Format exception.
1215
1216     This exception will be thrown when a video is requested
1217     in a format that is not available for that video.
1218     """
1219     msg = 'Unable to download video'
1220
1221     def __init__(self, err=None):
1222         if err is not None:
1223             self.msg += f': {err}'
1224         super().__init__(self.msg)
1225
1226
1227 class ContentTooShortError(YoutubeDLError):
1228     """Content Too Short exception.
1229
1230     This exception may be raised by FileDownloader objects when a file they
1231     download is too small for what the server announced first, indicating
1232     the connection was probably interrupted.
1233     """
1234
1235     def __init__(self, downloaded, expected):
1236         super(ContentTooShortError, self).__init__(
1237             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
1238         )
1239         # Both in bytes
1240         self.downloaded = downloaded
1241         self.expected = expected
1242
1243
1244 class XAttrMetadataError(YoutubeDLError):
1245     def __init__(self, code=None, msg='Unknown error'):
1246         super(XAttrMetadataError, self).__init__(msg)
1247         self.code = code
1248         self.msg = msg
1249
1250         # Parsing code and msg
1251         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1252                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1253             self.reason = 'NO_SPACE'
1254         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1255             self.reason = 'VALUE_TOO_LONG'
1256         else:
1257             self.reason = 'NOT_SUPPORTED'
1258
1259
1260 class XAttrUnavailableError(YoutubeDLError):
1261     pass
1262
1263
1264 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1265     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
1266     # expected HTTP responses to meet HTTP/1.0 or later (see also
1267     # https://github.com/ytdl-org/youtube-dl/issues/6727)
1268     if sys.version_info < (3, 0):
1269         kwargs['strict'] = True
1270     hc = http_class(*args, **compat_kwargs(kwargs))
1271     source_address = ydl_handler._params.get('source_address')
1272
1273     if source_address is not None:
1274         # This is to workaround _create_connection() from socket where it will try all
1275         # address data from getaddrinfo() including IPv6. This filters the result from
1276         # getaddrinfo() based on the source_address value.
1277         # This is based on the cpython socket.create_connection() function.
1278         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1279         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1280             host, port = address
1281             err = None
1282             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1283             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1284             ip_addrs = [addr for addr in addrs if addr[0] == af]
1285             if addrs and not ip_addrs:
1286                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1287                 raise socket.error(
1288                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1289                     % (ip_version, source_address[0]))
1290             for res in ip_addrs:
1291                 af, socktype, proto, canonname, sa = res
1292                 sock = None
1293                 try:
1294                     sock = socket.socket(af, socktype, proto)
1295                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1296                         sock.settimeout(timeout)
1297                     sock.bind(source_address)
1298                     sock.connect(sa)
1299                     err = None  # Explicitly break reference cycle
1300                     return sock
1301                 except socket.error as _:
1302                     err = _
1303                     if sock is not None:
1304                         sock.close()
1305             if err is not None:
1306                 raise err
1307             else:
1308                 raise socket.error('getaddrinfo returns an empty list')
1309         if hasattr(hc, '_create_connection'):
1310             hc._create_connection = _create_connection
1311         sa = (source_address, 0)
1312         if hasattr(hc, 'source_address'):  # Python 2.7+
1313             hc.source_address = sa
1314         else:  # Python 2.6
1315             def _hc_connect(self, *args, **kwargs):
1316                 sock = _create_connection(
1317                     (self.host, self.port), self.timeout, sa)
1318                 if is_https:
1319                     self.sock = ssl.wrap_socket(
1320                         sock, self.key_file, self.cert_file,
1321                         ssl_version=ssl.PROTOCOL_TLSv1)
1322                 else:
1323                     self.sock = sock
1324             hc.connect = functools.partial(_hc_connect, hc)
1325
1326     return hc
1327
1328
1329 def handle_youtubedl_headers(headers):
1330     filtered_headers = headers
1331
1332     if 'Youtubedl-no-compression' in filtered_headers:
1333         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
1334         del filtered_headers['Youtubedl-no-compression']
1335
1336     return filtered_headers
1337
1338
1339 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1340     """Handler for HTTP requests and responses.
1341
1342     This class, when installed with an OpenerDirector, automatically adds
1343     the standard headers to every HTTP request and handles gzipped and
1344     deflated responses from web servers. If compression is to be avoided in
1345     a particular request, the original request in the program code only has
1346     to include the HTTP header "Youtubedl-no-compression", which will be
1347     removed before making the real request.
1348
1349     Part of this code was copied from:
1350
1351     http://techknack.net/python-urllib2-handlers/
1352
1353     Andrew Rowls, the author of that code, agreed to release it to the
1354     public domain.
1355     """
1356
1357     def __init__(self, params, *args, **kwargs):
1358         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1359         self._params = params
1360
1361     def http_open(self, req):
1362         conn_class = compat_http_client.HTTPConnection
1363
1364         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1365         if socks_proxy:
1366             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1367             del req.headers['Ytdl-socks-proxy']
1368
1369         return self.do_open(functools.partial(
1370             _create_http_connection, self, conn_class, False),
1371             req)
1372
1373     @staticmethod
1374     def deflate(data):
1375         if not data:
1376             return data
1377         try:
1378             return zlib.decompress(data, -zlib.MAX_WBITS)
1379         except zlib.error:
1380             return zlib.decompress(data)
1381
1382     @staticmethod
1383     def brotli(data):
1384         if not data:
1385             return data
1386         return compat_brotli.decompress(data)
1387
1388     def http_request(self, req):
1389         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1390         # always respected by websites, some tend to give out URLs with non percent-encoded
1391         # non-ASCII characters (see telemb.py, ard.py [#3412])
1392         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1393         # To work around aforementioned issue we will replace request's original URL with
1394         # percent-encoded one
1395         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1396         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1397         url = req.get_full_url()
1398         url_escaped = escape_url(url)
1399
1400         # Substitute URL if any change after escaping
1401         if url != url_escaped:
1402             req = update_Request(req, url=url_escaped)
1403
1404         for h, v in self._params.get('http_headers', std_headers).items():
1405             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1406             # The dict keys are capitalized because of this bug by urllib
1407             if h.capitalize() not in req.headers:
1408                 req.add_header(h, v)
1409
1410         if 'Accept-encoding' not in req.headers:
1411             req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1412
1413         req.headers = handle_youtubedl_headers(req.headers)
1414
1415         if sys.version_info < (2, 7) and '#' in req.get_full_url():
1416             # Python 2.6 is brain-dead when it comes to fragments
1417             req._Request__original = req._Request__original.partition('#')[0]
1418             req._Request__r_type = req._Request__r_type.partition('#')[0]
1419
1420         return req
1421
1422     def http_response(self, req, resp):
1423         old_resp = resp
1424         # gzip
1425         if resp.headers.get('Content-encoding', '') == 'gzip':
1426             content = resp.read()
1427             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1428             try:
1429                 uncompressed = io.BytesIO(gz.read())
1430             except IOError as original_ioerror:
1431                 # There may be junk add the end of the file
1432                 # See http://stackoverflow.com/q/4928560/35070 for details
1433                 for i in range(1, 1024):
1434                     try:
1435                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1436                         uncompressed = io.BytesIO(gz.read())
1437                     except IOError:
1438                         continue
1439                     break
1440                 else:
1441                     raise original_ioerror
1442             resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1443             resp.msg = old_resp.msg
1444             del resp.headers['Content-encoding']
1445         # deflate
1446         if resp.headers.get('Content-encoding', '') == 'deflate':
1447             gz = io.BytesIO(self.deflate(resp.read()))
1448             resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1449             resp.msg = old_resp.msg
1450             del resp.headers['Content-encoding']
1451         # brotli
1452         if resp.headers.get('Content-encoding', '') == 'br':
1453             resp = compat_urllib_request.addinfourl(
1454                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1455             resp.msg = old_resp.msg
1456             del resp.headers['Content-encoding']
1457         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1458         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1459         if 300 <= resp.code < 400:
1460             location = resp.headers.get('Location')
1461             if location:
1462                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1463                 if sys.version_info >= (3, 0):
1464                     location = location.encode('iso-8859-1').decode('utf-8')
1465                 else:
1466                     location = location.decode('utf-8')
1467                 location_escaped = escape_url(location)
1468                 if location != location_escaped:
1469                     del resp.headers['Location']
1470                     if sys.version_info < (3, 0):
1471                         location_escaped = location_escaped.encode('utf-8')
1472                     resp.headers['Location'] = location_escaped
1473         return resp
1474
1475     https_request = http_request
1476     https_response = http_response
1477
1478
1479 def make_socks_conn_class(base_class, socks_proxy):
1480     assert issubclass(base_class, (
1481         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1482
1483     url_components = compat_urlparse.urlparse(socks_proxy)
1484     if url_components.scheme.lower() == 'socks5':
1485         socks_type = ProxyType.SOCKS5
1486     elif url_components.scheme.lower() in ('socks', 'socks4'):
1487         socks_type = ProxyType.SOCKS4
1488     elif url_components.scheme.lower() == 'socks4a':
1489         socks_type = ProxyType.SOCKS4A
1490
1491     def unquote_if_non_empty(s):
1492         if not s:
1493             return s
1494         return compat_urllib_parse_unquote_plus(s)
1495
1496     proxy_args = (
1497         socks_type,
1498         url_components.hostname, url_components.port or 1080,
1499         True,  # Remote DNS
1500         unquote_if_non_empty(url_components.username),
1501         unquote_if_non_empty(url_components.password),
1502     )
1503
1504     class SocksConnection(base_class):
1505         def connect(self):
1506             self.sock = sockssocket()
1507             self.sock.setproxy(*proxy_args)
1508             if type(self.timeout) in (int, float):
1509                 self.sock.settimeout(self.timeout)
1510             self.sock.connect((self.host, self.port))
1511
1512             if isinstance(self, compat_http_client.HTTPSConnection):
1513                 if hasattr(self, '_context'):  # Python > 2.6
1514                     self.sock = self._context.wrap_socket(
1515                         self.sock, server_hostname=self.host)
1516                 else:
1517                     self.sock = ssl.wrap_socket(self.sock)
1518
1519     return SocksConnection
1520
1521
1522 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1523     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1524         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1525         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1526         self._params = params
1527
1528     def https_open(self, req):
1529         kwargs = {}
1530         conn_class = self._https_conn_class
1531
1532         if hasattr(self, '_context'):  # python > 2.6
1533             kwargs['context'] = self._context
1534         if hasattr(self, '_check_hostname'):  # python 3.x
1535             kwargs['check_hostname'] = self._check_hostname
1536
1537         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1538         if socks_proxy:
1539             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1540             del req.headers['Ytdl-socks-proxy']
1541
1542         return self.do_open(functools.partial(
1543             _create_http_connection, self, conn_class, True),
1544             req, **kwargs)
1545
1546
1547 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1548     """
1549     See [1] for cookie file format.
1550
1551     1. https://curl.haxx.se/docs/http-cookies.html
1552     """
1553     _HTTPONLY_PREFIX = '#HttpOnly_'
1554     _ENTRY_LEN = 7
1555     _HEADER = '''# Netscape HTTP Cookie File
1556 # This file is generated by yt-dlp.  Do not edit.
1557
1558 '''
1559     _CookieFileEntry = collections.namedtuple(
1560         'CookieFileEntry',
1561         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1562
1563     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1564         """
1565         Save cookies to a file.
1566
1567         Most of the code is taken from CPython 3.8 and slightly adapted
1568         to support cookie files with UTF-8 in both python 2 and 3.
1569         """
1570         if filename is None:
1571             if self.filename is not None:
1572                 filename = self.filename
1573             else:
1574                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1575
1576         # Store session cookies with `expires` set to 0 instead of an empty
1577         # string
1578         for cookie in self:
1579             if cookie.expires is None:
1580                 cookie.expires = 0
1581
1582         with io.open(filename, 'w', encoding='utf-8') as f:
1583             f.write(self._HEADER)
1584             now = time.time()
1585             for cookie in self:
1586                 if not ignore_discard and cookie.discard:
1587                     continue
1588                 if not ignore_expires and cookie.is_expired(now):
1589                     continue
1590                 if cookie.secure:
1591                     secure = 'TRUE'
1592                 else:
1593                     secure = 'FALSE'
1594                 if cookie.domain.startswith('.'):
1595                     initial_dot = 'TRUE'
1596                 else:
1597                     initial_dot = 'FALSE'
1598                 if cookie.expires is not None:
1599                     expires = compat_str(cookie.expires)
1600                 else:
1601                     expires = ''
1602                 if cookie.value is None:
1603                     # cookies.txt regards 'Set-Cookie: foo' as a cookie
1604                     # with no name, whereas http.cookiejar regards it as a
1605                     # cookie with no value.
1606                     name = ''
1607                     value = cookie.name
1608                 else:
1609                     name = cookie.name
1610                     value = cookie.value
1611                 f.write(
1612                     '\t'.join([cookie.domain, initial_dot, cookie.path,
1613                                secure, expires, name, value]) + '\n')
1614
1615     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1616         """Load cookies from a file."""
1617         if filename is None:
1618             if self.filename is not None:
1619                 filename = self.filename
1620             else:
1621                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1622
1623         def prepare_line(line):
1624             if line.startswith(self._HTTPONLY_PREFIX):
1625                 line = line[len(self._HTTPONLY_PREFIX):]
1626             # comments and empty lines are fine
1627             if line.startswith('#') or not line.strip():
1628                 return line
1629             cookie_list = line.split('\t')
1630             if len(cookie_list) != self._ENTRY_LEN:
1631                 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1632             cookie = self._CookieFileEntry(*cookie_list)
1633             if cookie.expires_at and not cookie.expires_at.isdigit():
1634                 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1635             return line
1636
1637         cf = io.StringIO()
1638         with io.open(filename, encoding='utf-8') as f:
1639             for line in f:
1640                 try:
1641                     cf.write(prepare_line(line))
1642                 except compat_cookiejar.LoadError as e:
1643                     write_string(
1644                         'WARNING: skipping cookie file entry due to %s: %r\n'
1645                         % (e, line), sys.stderr)
1646                     continue
1647         cf.seek(0)
1648         self._really_load(cf, filename, ignore_discard, ignore_expires)
1649         # Session cookies are denoted by either `expires` field set to
1650         # an empty string or 0. MozillaCookieJar only recognizes the former
1651         # (see [1]). So we need force the latter to be recognized as session
1652         # cookies on our own.
1653         # Session cookies may be important for cookies-based authentication,
1654         # e.g. usually, when user does not check 'Remember me' check box while
1655         # logging in on a site, some important cookies are stored as session
1656         # cookies so that not recognizing them will result in failed login.
1657         # 1. https://bugs.python.org/issue17164
1658         for cookie in self:
1659             # Treat `expires=0` cookies as session cookies
1660             if cookie.expires == 0:
1661                 cookie.expires = None
1662                 cookie.discard = True
1663
1664
1665 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1666     def __init__(self, cookiejar=None):
1667         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1668
1669     def http_response(self, request, response):
1670         # Python 2 will choke on next HTTP request in row if there are non-ASCII
1671         # characters in Set-Cookie HTTP header of last response (see
1672         # https://github.com/ytdl-org/youtube-dl/issues/6769).
1673         # In order to at least prevent crashing we will percent encode Set-Cookie
1674         # header before HTTPCookieProcessor starts processing it.
1675         # if sys.version_info < (3, 0) and response.headers:
1676         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1677         #         set_cookie = response.headers.get(set_cookie_header)
1678         #         if set_cookie:
1679         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1680         #             if set_cookie != set_cookie_escaped:
1681         #                 del response.headers[set_cookie_header]
1682         #                 response.headers[set_cookie_header] = set_cookie_escaped
1683         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1684
1685     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1686     https_response = http_response
1687
1688
1689 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1690     """YoutubeDL redirect handler
1691
1692     The code is based on HTTPRedirectHandler implementation from CPython [1].
1693
1694     This redirect handler solves two issues:
1695      - ensures redirect URL is always unicode under python 2
1696      - introduces support for experimental HTTP response status code
1697        308 Permanent Redirect [2] used by some sites [3]
1698
1699     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1700     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1701     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1702     """
1703
1704     http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1705
1706     def redirect_request(self, req, fp, code, msg, headers, newurl):
1707         """Return a Request or None in response to a redirect.
1708
1709         This is called by the http_error_30x methods when a
1710         redirection response is received.  If a redirection should
1711         take place, return a new Request to allow http_error_30x to
1712         perform the redirect.  Otherwise, raise HTTPError if no-one
1713         else should try to handle this url.  Return None if you can't
1714         but another Handler might.
1715         """
1716         m = req.get_method()
1717         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1718                  or code in (301, 302, 303) and m == "POST")):
1719             raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1720         # Strictly (according to RFC 2616), 301 or 302 in response to
1721         # a POST MUST NOT cause a redirection without confirmation
1722         # from the user (of urllib.request, in this case).  In practice,
1723         # essentially all clients do redirect in this case, so we do
1724         # the same.
1725
1726         # On python 2 urlh.geturl() may sometimes return redirect URL
1727         # as byte string instead of unicode. This workaround allows
1728         # to force it always return unicode.
1729         if sys.version_info[0] < 3:
1730             newurl = compat_str(newurl)
1731
1732         # Be conciliant with URIs containing a space.  This is mainly
1733         # redundant with the more complete encoding done in http_error_302(),
1734         # but it is kept for compatibility with other callers.
1735         newurl = newurl.replace(' ', '%20')
1736
1737         CONTENT_HEADERS = ("content-length", "content-type")
1738         # NB: don't use dict comprehension for python 2.6 compatibility
1739         newheaders = dict((k, v) for k, v in req.headers.items()
1740                           if k.lower() not in CONTENT_HEADERS)
1741         return compat_urllib_request.Request(
1742             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1743             unverifiable=True)
1744
1745
1746 def extract_timezone(date_str):
1747     m = re.search(
1748         r'''(?x)
1749             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1750             (?P<tz>Z|                                            # just the UTC Z, or
1751                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1752                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1753                    [ ]?                                          # optional space
1754                 (?P<sign>\+|-)                                   # +/-
1755                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1756             $)
1757         ''', date_str)
1758     if not m:
1759         timezone = datetime.timedelta()
1760     else:
1761         date_str = date_str[:-len(m.group('tz'))]
1762         if not m.group('sign'):
1763             timezone = datetime.timedelta()
1764         else:
1765             sign = 1 if m.group('sign') == '+' else -1
1766             timezone = datetime.timedelta(
1767                 hours=sign * int(m.group('hours')),
1768                 minutes=sign * int(m.group('minutes')))
1769     return timezone, date_str
1770
1771
1772 def parse_iso8601(date_str, delimiter='T', timezone=None):
1773     """ Return a UNIX timestamp from the given date """
1774
1775     if date_str is None:
1776         return None
1777
1778     date_str = re.sub(r'\.[0-9]+', '', date_str)
1779
1780     if timezone is None:
1781         timezone, date_str = extract_timezone(date_str)
1782
1783     try:
1784         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1785         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1786         return calendar.timegm(dt.timetuple())
1787     except ValueError:
1788         pass
1789
1790
1791 def date_formats(day_first=True):
1792     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1793
1794
1795 def unified_strdate(date_str, day_first=True):
1796     """Return a string with the date in the format YYYYMMDD"""
1797
1798     if date_str is None:
1799         return None
1800     upload_date = None
1801     # Replace commas
1802     date_str = date_str.replace(',', ' ')
1803     # Remove AM/PM + timezone
1804     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1805     _, date_str = extract_timezone(date_str)
1806
1807     for expression in date_formats(day_first):
1808         try:
1809             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1810         except ValueError:
1811             pass
1812     if upload_date is None:
1813         timetuple = email.utils.parsedate_tz(date_str)
1814         if timetuple:
1815             try:
1816                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1817             except ValueError:
1818                 pass
1819     if upload_date is not None:
1820         return compat_str(upload_date)
1821
1822
1823 def unified_timestamp(date_str, day_first=True):
1824     if date_str is None:
1825         return None
1826
1827     date_str = re.sub(r'[,|]', '', date_str)
1828
1829     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1830     timezone, date_str = extract_timezone(date_str)
1831
1832     # Remove AM/PM + timezone
1833     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1834
1835     # Remove unrecognized timezones from ISO 8601 alike timestamps
1836     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1837     if m:
1838         date_str = date_str[:-len(m.group('tz'))]
1839
1840     # Python only supports microseconds, so remove nanoseconds
1841     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1842     if m:
1843         date_str = m.group(1)
1844
1845     for expression in date_formats(day_first):
1846         try:
1847             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1848             return calendar.timegm(dt.timetuple())
1849         except ValueError:
1850             pass
1851     timetuple = email.utils.parsedate_tz(date_str)
1852     if timetuple:
1853         return calendar.timegm(timetuple) + pm_delta * 3600
1854
1855
1856 def determine_ext(url, default_ext='unknown_video'):
1857     if url is None or '.' not in url:
1858         return default_ext
1859     guess = url.partition('?')[0].rpartition('.')[2]
1860     if re.match(r'^[A-Za-z0-9]+$', guess):
1861         return guess
1862     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1863     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1864         return guess.rstrip('/')
1865     else:
1866         return default_ext
1867
1868
1869 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1870     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1871
1872
1873 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1874     """
1875     Return a datetime object from a string in the format YYYYMMDD or
1876     (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1877
1878     format: string date format used to return datetime object from
1879     precision: round the time portion of a datetime object.
1880                 auto|microsecond|second|minute|hour|day.
1881                 auto: round to the unit provided in date_str (if applicable).
1882     """
1883     auto_precision = False
1884     if precision == 'auto':
1885         auto_precision = True
1886         precision = 'microsecond'
1887     today = datetime_round(datetime.datetime.utcnow(), precision)
1888     if date_str in ('now', 'today'):
1889         return today
1890     if date_str == 'yesterday':
1891         return today - datetime.timedelta(days=1)
1892     match = re.match(
1893         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1894         date_str)
1895     if match is not None:
1896         start_time = datetime_from_str(match.group('start'), precision, format)
1897         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1898         unit = match.group('unit')
1899         if unit == 'month' or unit == 'year':
1900             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1901             unit = 'day'
1902         else:
1903             if unit == 'week':
1904                 unit = 'day'
1905                 time *= 7
1906             delta = datetime.timedelta(**{unit + 's': time})
1907             new_date = start_time + delta
1908         if auto_precision:
1909             return datetime_round(new_date, unit)
1910         return new_date
1911
1912     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1913
1914
1915 def date_from_str(date_str, format='%Y%m%d', strict=False):
1916     """
1917     Return a datetime object from a string in the format YYYYMMDD or
1918     (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1919
1920     If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1921
1922     format: string date format used to return datetime object from
1923     """
1924     if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1925         raise ValueError(f'Invalid date format {date_str}')
1926     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1927
1928
1929 def datetime_add_months(dt, months):
1930     """Increment/Decrement a datetime object by months."""
1931     month = dt.month + months - 1
1932     year = dt.year + month // 12
1933     month = month % 12 + 1
1934     day = min(dt.day, calendar.monthrange(year, month)[1])
1935     return dt.replace(year, month, day)
1936
1937
1938 def datetime_round(dt, precision='day'):
1939     """
1940     Round a datetime object's time to a specific precision
1941     """
1942     if precision == 'microsecond':
1943         return dt
1944
1945     unit_seconds = {
1946         'day': 86400,
1947         'hour': 3600,
1948         'minute': 60,
1949         'second': 1,
1950     }
1951     roundto = lambda x, n: ((x + n / 2) // n) * n
1952     timestamp = calendar.timegm(dt.timetuple())
1953     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1954
1955
1956 def hyphenate_date(date_str):
1957     """
1958     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1959     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1960     if match is not None:
1961         return '-'.join(match.groups())
1962     else:
1963         return date_str
1964
1965
1966 class DateRange(object):
1967     """Represents a time interval between two dates"""
1968
1969     def __init__(self, start=None, end=None):
1970         """start and end must be strings in the format accepted by date"""
1971         if start is not None:
1972             self.start = date_from_str(start, strict=True)
1973         else:
1974             self.start = datetime.datetime.min.date()
1975         if end is not None:
1976             self.end = date_from_str(end, strict=True)
1977         else:
1978             self.end = datetime.datetime.max.date()
1979         if self.start > self.end:
1980             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1981
1982     @classmethod
1983     def day(cls, day):
1984         """Returns a range that only contains the given day"""
1985         return cls(day, day)
1986
1987     def __contains__(self, date):
1988         """Check if the date is in the range"""
1989         if not isinstance(date, datetime.date):
1990             date = date_from_str(date)
1991         return self.start <= date <= self.end
1992
1993     def __str__(self):
1994         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1995
1996
1997 def platform_name():
1998     """ Returns the platform name as a compat_str """
1999     res = platform.platform()
2000     if isinstance(res, bytes):
2001         res = res.decode(preferredencoding())
2002
2003     assert isinstance(res, compat_str)
2004     return res
2005
2006
2007 def get_windows_version():
2008     ''' Get Windows version. None if it's not running on Windows '''
2009     if compat_os_name == 'nt':
2010         return version_tuple(platform.win32_ver()[1])
2011     else:
2012         return None
2013
2014
2015 def _windows_write_string(s, out):
2016     """ Returns True if the string was written using special methods,
2017     False if it has yet to be written out."""
2018     # Adapted from http://stackoverflow.com/a/3259271/35070
2019
2020     import ctypes.wintypes
2021
2022     WIN_OUTPUT_IDS = {
2023         1: -11,
2024         2: -12,
2025     }
2026
2027     try:
2028         fileno = out.fileno()
2029     except AttributeError:
2030         # If the output stream doesn't have a fileno, it's virtual
2031         return False
2032     except io.UnsupportedOperation:
2033         # Some strange Windows pseudo files?
2034         return False
2035     if fileno not in WIN_OUTPUT_IDS:
2036         return False
2037
2038     GetStdHandle = compat_ctypes_WINFUNCTYPE(
2039         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2040         ('GetStdHandle', ctypes.windll.kernel32))
2041     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2042
2043     WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2044         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2045         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2046         ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2047     written = ctypes.wintypes.DWORD(0)
2048
2049     GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2050     FILE_TYPE_CHAR = 0x0002
2051     FILE_TYPE_REMOTE = 0x8000
2052     GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2053         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2054         ctypes.POINTER(ctypes.wintypes.DWORD))(
2055         ('GetConsoleMode', ctypes.windll.kernel32))
2056     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2057
2058     def not_a_console(handle):
2059         if handle == INVALID_HANDLE_VALUE or handle is None:
2060             return True
2061         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2062                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2063
2064     if not_a_console(h):
2065         return False
2066
2067     def next_nonbmp_pos(s):
2068         try:
2069             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2070         except StopIteration:
2071             return len(s)
2072
2073     while s:
2074         count = min(next_nonbmp_pos(s), 1024)
2075
2076         ret = WriteConsoleW(
2077             h, s, count if count else 2, ctypes.byref(written), None)
2078         if ret == 0:
2079             raise OSError('Failed to write string')
2080         if not count:  # We just wrote a non-BMP character
2081             assert written.value == 2
2082             s = s[1:]
2083         else:
2084             assert written.value > 0
2085             s = s[written.value:]
2086     return True
2087
2088
2089 def write_string(s, out=None, encoding=None):
2090     if out is None:
2091         out = sys.stderr
2092     assert type(s) == compat_str
2093
2094     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2095         if _windows_write_string(s, out):
2096             return
2097
2098     if ('b' in getattr(out, 'mode', '')
2099             or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
2100         byt = s.encode(encoding or preferredencoding(), 'ignore')
2101         out.write(byt)
2102     elif hasattr(out, 'buffer'):
2103         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2104         byt = s.encode(enc, 'ignore')
2105         out.buffer.write(byt)
2106     else:
2107         out.write(s)
2108     out.flush()
2109
2110
2111 def bytes_to_intlist(bs):
2112     if not bs:
2113         return []
2114     if isinstance(bs[0], int):  # Python 3
2115         return list(bs)
2116     else:
2117         return [ord(c) for c in bs]
2118
2119
2120 def intlist_to_bytes(xs):
2121     if not xs:
2122         return b''
2123     return compat_struct_pack('%dB' % len(xs), *xs)
2124
2125
2126 class LockingUnsupportedError(IOError):
2127     msg = 'File locking is not supported on this platform'
2128
2129     def __init__(self):
2130         super().__init__(self.msg)
2131
2132
2133 # Cross-platform file locking
2134 if sys.platform == 'win32':
2135     import ctypes.wintypes
2136     import msvcrt
2137
2138     class OVERLAPPED(ctypes.Structure):
2139         _fields_ = [
2140             ('Internal', ctypes.wintypes.LPVOID),
2141             ('InternalHigh', ctypes.wintypes.LPVOID),
2142             ('Offset', ctypes.wintypes.DWORD),
2143             ('OffsetHigh', ctypes.wintypes.DWORD),
2144             ('hEvent', ctypes.wintypes.HANDLE),
2145         ]
2146
2147     kernel32 = ctypes.windll.kernel32
2148     LockFileEx = kernel32.LockFileEx
2149     LockFileEx.argtypes = [
2150         ctypes.wintypes.HANDLE,     # hFile
2151         ctypes.wintypes.DWORD,      # dwFlags
2152         ctypes.wintypes.DWORD,      # dwReserved
2153         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2154         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2155         ctypes.POINTER(OVERLAPPED)  # Overlapped
2156     ]
2157     LockFileEx.restype = ctypes.wintypes.BOOL
2158     UnlockFileEx = kernel32.UnlockFileEx
2159     UnlockFileEx.argtypes = [
2160         ctypes.wintypes.HANDLE,     # hFile
2161         ctypes.wintypes.DWORD,      # dwReserved
2162         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2163         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2164         ctypes.POINTER(OVERLAPPED)  # Overlapped
2165     ]
2166     UnlockFileEx.restype = ctypes.wintypes.BOOL
2167     whole_low = 0xffffffff
2168     whole_high = 0x7fffffff
2169
2170     def _lock_file(f, exclusive, block):
2171         overlapped = OVERLAPPED()
2172         overlapped.Offset = 0
2173         overlapped.OffsetHigh = 0
2174         overlapped.hEvent = 0
2175         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2176
2177         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2178                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2179                           0, whole_low, whole_high, f._lock_file_overlapped_p):
2180             raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2181
2182     def _unlock_file(f):
2183         assert f._lock_file_overlapped_p
2184         handle = msvcrt.get_osfhandle(f.fileno())
2185         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2186             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2187
2188 else:
2189     try:
2190         import fcntl
2191
2192         def _lock_file(f, exclusive, block):
2193             try:
2194                 fcntl.flock(f,
2195                             fcntl.LOCK_SH if not exclusive
2196                             else fcntl.LOCK_EX if block
2197                             else fcntl.LOCK_EX | fcntl.LOCK_NB)
2198             except BlockingIOError:
2199                 raise
2200             except OSError:  # AOSP does not have flock()
2201                 fcntl.lockf(f,
2202                             fcntl.LOCK_SH if not exclusive
2203                             else fcntl.LOCK_EX if block
2204                             else fcntl.LOCK_EX | fcntl.LOCK_NB)
2205
2206         def _unlock_file(f):
2207             try:
2208                 fcntl.flock(f, fcntl.LOCK_UN)
2209             except OSError:
2210                 fcntl.lockf(f, fcntl.LOCK_UN)
2211
2212     except ImportError:
2213
2214         def _lock_file(f, exclusive, block):
2215             raise LockingUnsupportedError()
2216
2217         def _unlock_file(f):
2218             raise LockingUnsupportedError()
2219
2220
2221 class locked_file(object):
2222     locked = False
2223
2224     def __init__(self, filename, mode, block=True, encoding=None):
2225         assert mode in {'r', 'rb', 'a', 'ab', 'w', 'wb'}
2226         self.f = open(filename, mode, encoding=encoding)
2227         self.mode = mode
2228         self.block = block
2229
2230     def __enter__(self):
2231         exclusive = 'r' not in self.mode
2232         try:
2233             _lock_file(self.f, exclusive, self.block)
2234             self.locked = True
2235         except IOError:
2236             self.f.close()
2237             raise
2238         return self
2239
2240     def unlock(self):
2241         if not self.locked:
2242             return
2243         try:
2244             _unlock_file(self.f)
2245         finally:
2246             self.locked = False
2247
2248     def __exit__(self, *_):
2249         try:
2250             self.unlock()
2251         finally:
2252             self.f.close()
2253
2254     open = __enter__
2255     close = __exit__
2256
2257     def __getattr__(self, attr):
2258         return getattr(self.f, attr)
2259
2260     def __iter__(self):
2261         return iter(self.f)
2262
2263
2264 def get_filesystem_encoding():
2265     encoding = sys.getfilesystemencoding()
2266     return encoding if encoding is not None else 'utf-8'
2267
2268
2269 def shell_quote(args):
2270     quoted_args = []
2271     encoding = get_filesystem_encoding()
2272     for a in args:
2273         if isinstance(a, bytes):
2274             # We may get a filename encoded with 'encodeFilename'
2275             a = a.decode(encoding)
2276         quoted_args.append(compat_shlex_quote(a))
2277     return ' '.join(quoted_args)
2278
2279
2280 def smuggle_url(url, data):
2281     """ Pass additional data in a URL for internal use. """
2282
2283     url, idata = unsmuggle_url(url, {})
2284     data.update(idata)
2285     sdata = compat_urllib_parse_urlencode(
2286         {'__youtubedl_smuggle': json.dumps(data)})
2287     return url + '#' + sdata
2288
2289
2290 def unsmuggle_url(smug_url, default=None):
2291     if '#__youtubedl_smuggle' not in smug_url:
2292         return smug_url, default
2293     url, _, sdata = smug_url.rpartition('#')
2294     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2295     data = json.loads(jsond)
2296     return url, data
2297
2298
2299 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2300     """ Formats numbers with decimal sufixes like K, M, etc """
2301     num, factor = float_or_none(num), float(factor)
2302     if num is None or num < 0:
2303         return None
2304     POSSIBLE_SUFFIXES = 'kMGTPEZY'
2305     exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2306     suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2307     if factor == 1024:
2308         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2309     converted = num / (factor ** exponent)
2310     return fmt % (converted, suffix)
2311
2312
2313 def format_bytes(bytes):
2314     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2315
2316
2317 def lookup_unit_table(unit_table, s):
2318     units_re = '|'.join(re.escape(u) for u in unit_table)
2319     m = re.match(
2320         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2321     if not m:
2322         return None
2323     num_str = m.group('num').replace(',', '.')
2324     mult = unit_table[m.group('unit')]
2325     return int(float(num_str) * mult)
2326
2327
2328 def parse_filesize(s):
2329     if s is None:
2330         return None
2331
2332     # The lower-case forms are of course incorrect and unofficial,
2333     # but we support those too
2334     _UNIT_TABLE = {
2335         'B': 1,
2336         'b': 1,
2337         'bytes': 1,
2338         'KiB': 1024,
2339         'KB': 1000,
2340         'kB': 1024,
2341         'Kb': 1000,
2342         'kb': 1000,
2343         'kilobytes': 1000,
2344         'kibibytes': 1024,
2345         'MiB': 1024 ** 2,
2346         'MB': 1000 ** 2,
2347         'mB': 1024 ** 2,
2348         'Mb': 1000 ** 2,
2349         'mb': 1000 ** 2,
2350         'megabytes': 1000 ** 2,
2351         'mebibytes': 1024 ** 2,
2352         'GiB': 1024 ** 3,
2353         'GB': 1000 ** 3,
2354         'gB': 1024 ** 3,
2355         'Gb': 1000 ** 3,
2356         'gb': 1000 ** 3,
2357         'gigabytes': 1000 ** 3,
2358         'gibibytes': 1024 ** 3,
2359         'TiB': 1024 ** 4,
2360         'TB': 1000 ** 4,
2361         'tB': 1024 ** 4,
2362         'Tb': 1000 ** 4,
2363         'tb': 1000 ** 4,
2364         'terabytes': 1000 ** 4,
2365         'tebibytes': 1024 ** 4,
2366         'PiB': 1024 ** 5,
2367         'PB': 1000 ** 5,
2368         'pB': 1024 ** 5,
2369         'Pb': 1000 ** 5,
2370         'pb': 1000 ** 5,
2371         'petabytes': 1000 ** 5,
2372         'pebibytes': 1024 ** 5,
2373         'EiB': 1024 ** 6,
2374         'EB': 1000 ** 6,
2375         'eB': 1024 ** 6,
2376         'Eb': 1000 ** 6,
2377         'eb': 1000 ** 6,
2378         'exabytes': 1000 ** 6,
2379         'exbibytes': 1024 ** 6,
2380         'ZiB': 1024 ** 7,
2381         'ZB': 1000 ** 7,
2382         'zB': 1024 ** 7,
2383         'Zb': 1000 ** 7,
2384         'zb': 1000 ** 7,
2385         'zettabytes': 1000 ** 7,
2386         'zebibytes': 1024 ** 7,
2387         'YiB': 1024 ** 8,
2388         'YB': 1000 ** 8,
2389         'yB': 1024 ** 8,
2390         'Yb': 1000 ** 8,
2391         'yb': 1000 ** 8,
2392         'yottabytes': 1000 ** 8,
2393         'yobibytes': 1024 ** 8,
2394     }
2395
2396     return lookup_unit_table(_UNIT_TABLE, s)
2397
2398
2399 def parse_count(s):
2400     if s is None:
2401         return None
2402
2403     s = re.sub(r'^[^\d]+\s', '', s).strip()
2404
2405     if re.match(r'^[\d,.]+$', s):
2406         return str_to_int(s)
2407
2408     _UNIT_TABLE = {
2409         'k': 1000,
2410         'K': 1000,
2411         'm': 1000 ** 2,
2412         'M': 1000 ** 2,
2413         'kk': 1000 ** 2,
2414         'KK': 1000 ** 2,
2415         'b': 1000 ** 3,
2416         'B': 1000 ** 3,
2417     }
2418
2419     ret = lookup_unit_table(_UNIT_TABLE, s)
2420     if ret is not None:
2421         return ret
2422
2423     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2424     if mobj:
2425         return str_to_int(mobj.group(1))
2426
2427
2428 def parse_resolution(s, *, lenient=False):
2429     if s is None:
2430         return {}
2431
2432     if lenient:
2433         mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2434     else:
2435         mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2436     if mobj:
2437         return {
2438             'width': int(mobj.group('w')),
2439             'height': int(mobj.group('h')),
2440         }
2441
2442     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2443     if mobj:
2444         return {'height': int(mobj.group(1))}
2445
2446     mobj = re.search(r'\b([48])[kK]\b', s)
2447     if mobj:
2448         return {'height': int(mobj.group(1)) * 540}
2449
2450     return {}
2451
2452
2453 def parse_bitrate(s):
2454     if not isinstance(s, compat_str):
2455         return
2456     mobj = re.search(r'\b(\d+)\s*kbps', s)
2457     if mobj:
2458         return int(mobj.group(1))
2459
2460
2461 def month_by_name(name, lang='en'):
2462     """ Return the number of a month by (locale-independently) English name """
2463
2464     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2465
2466     try:
2467         return month_names.index(name) + 1
2468     except ValueError:
2469         return None
2470
2471
2472 def month_by_abbreviation(abbrev):
2473     """ Return the number of a month by (locale-independently) English
2474         abbreviations """
2475
2476     try:
2477         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2478     except ValueError:
2479         return None
2480
2481
2482 def fix_xml_ampersands(xml_str):
2483     """Replace all the '&' by '&amp;' in XML"""
2484     return re.sub(
2485         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2486         '&amp;',
2487         xml_str)
2488
2489
2490 def setproctitle(title):
2491     assert isinstance(title, compat_str)
2492
2493     # ctypes in Jython is not complete
2494     # http://bugs.jython.org/issue2148
2495     if sys.platform.startswith('java'):
2496         return
2497
2498     try:
2499         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2500     except OSError:
2501         return
2502     except TypeError:
2503         # LoadLibrary in Windows Python 2.7.13 only expects
2504         # a bytestring, but since unicode_literals turns
2505         # every string into a unicode string, it fails.
2506         return
2507     title_bytes = title.encode('utf-8')
2508     buf = ctypes.create_string_buffer(len(title_bytes))
2509     buf.value = title_bytes
2510     try:
2511         libc.prctl(15, buf, 0, 0, 0)
2512     except AttributeError:
2513         return  # Strange libc, just skip this
2514
2515
2516 def remove_start(s, start):
2517     return s[len(start):] if s is not None and s.startswith(start) else s
2518
2519
2520 def remove_end(s, end):
2521     return s[:-len(end)] if s is not None and s.endswith(end) else s
2522
2523
2524 def remove_quotes(s):
2525     if s is None or len(s) < 2:
2526         return s
2527     for quote in ('"', "'", ):
2528         if s[0] == quote and s[-1] == quote:
2529             return s[1:-1]
2530     return s
2531
2532
2533 def get_domain(url):
2534     domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2535     return domain.group('domain') if domain else None
2536
2537
2538 def url_basename(url):
2539     path = compat_urlparse.urlparse(url).path
2540     return path.strip('/').split('/')[-1]
2541
2542
2543 def base_url(url):
2544     return re.match(r'https?://[^?#&]+/', url).group()
2545
2546
2547 def urljoin(base, path):
2548     if isinstance(path, bytes):
2549         path = path.decode('utf-8')
2550     if not isinstance(path, compat_str) or not path:
2551         return None
2552     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2553         return path
2554     if isinstance(base, bytes):
2555         base = base.decode('utf-8')
2556     if not isinstance(base, compat_str) or not re.match(
2557             r'^(?:https?:)?//', base):
2558         return None
2559     return compat_urlparse.urljoin(base, path)
2560
2561
2562 class HEADRequest(compat_urllib_request.Request):
2563     def get_method(self):
2564         return 'HEAD'
2565
2566
2567 class PUTRequest(compat_urllib_request.Request):
2568     def get_method(self):
2569         return 'PUT'
2570
2571
2572 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2573     if get_attr and v is not None:
2574         v = getattr(v, get_attr, None)
2575     try:
2576         return int(v) * invscale // scale
2577     except (ValueError, TypeError, OverflowError):
2578         return default
2579
2580
2581 def str_or_none(v, default=None):
2582     return default if v is None else compat_str(v)
2583
2584
2585 def str_to_int(int_str):
2586     """ A more relaxed version of int_or_none """
2587     if isinstance(int_str, compat_integer_types):
2588         return int_str
2589     elif isinstance(int_str, compat_str):
2590         int_str = re.sub(r'[,\.\+]', '', int_str)
2591         return int_or_none(int_str)
2592
2593
2594 def float_or_none(v, scale=1, invscale=1, default=None):
2595     if v is None:
2596         return default
2597     try:
2598         return float(v) * invscale / scale
2599     except (ValueError, TypeError):
2600         return default
2601
2602
2603 def bool_or_none(v, default=None):
2604     return v if isinstance(v, bool) else default
2605
2606
2607 def strip_or_none(v, default=None):
2608     return v.strip() if isinstance(v, compat_str) else default
2609
2610
2611 def url_or_none(url):
2612     if not url or not isinstance(url, compat_str):
2613         return None
2614     url = url.strip()
2615     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2616
2617
2618 def request_to_url(req):
2619     if isinstance(req, compat_urllib_request.Request):
2620         return req.get_full_url()
2621     else:
2622         return req
2623
2624
2625 def strftime_or_none(timestamp, date_format, default=None):
2626     datetime_object = None
2627     try:
2628         if isinstance(timestamp, compat_numeric_types):  # unix timestamp
2629             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2630         elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
2631             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2632         return datetime_object.strftime(date_format)
2633     except (ValueError, TypeError, AttributeError):
2634         return default
2635
2636
2637 def parse_duration(s):
2638     if not isinstance(s, compat_basestring):
2639         return None
2640     s = s.strip()
2641     if not s:
2642         return None
2643
2644     days, hours, mins, secs, ms = [None] * 5
2645     m = re.match(r'''(?x)
2646             (?P<before_secs>
2647                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2648             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2649             (?P<ms>[.:][0-9]+)?Z?$
2650         ''', s)
2651     if m:
2652         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2653     else:
2654         m = re.match(
2655             r'''(?ix)(?:P?
2656                 (?:
2657                     [0-9]+\s*y(?:ears?)?,?\s*
2658                 )?
2659                 (?:
2660                     [0-9]+\s*m(?:onths?)?,?\s*
2661                 )?
2662                 (?:
2663                     [0-9]+\s*w(?:eeks?)?,?\s*
2664                 )?
2665                 (?:
2666                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2667                 )?
2668                 T)?
2669                 (?:
2670                     (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2671                 )?
2672                 (?:
2673                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2674                 )?
2675                 (?:
2676                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2677                 )?Z?$''', s)
2678         if m:
2679             days, hours, mins, secs, ms = m.groups()
2680         else:
2681             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2682             if m:
2683                 hours, mins = m.groups()
2684             else:
2685                 return None
2686
2687     duration = 0
2688     if secs:
2689         duration += float(secs)
2690     if mins:
2691         duration += float(mins) * 60
2692     if hours:
2693         duration += float(hours) * 60 * 60
2694     if days:
2695         duration += float(days) * 24 * 60 * 60
2696     if ms:
2697         duration += float(ms.replace(':', '.'))
2698     return duration
2699
2700
2701 def prepend_extension(filename, ext, expected_real_ext=None):
2702     name, real_ext = os.path.splitext(filename)
2703     return (
2704         '{0}.{1}{2}'.format(name, ext, real_ext)
2705         if not expected_real_ext or real_ext[1:] == expected_real_ext
2706         else '{0}.{1}'.format(filename, ext))
2707
2708
2709 def replace_extension(filename, ext, expected_real_ext=None):
2710     name, real_ext = os.path.splitext(filename)
2711     return '{0}.{1}'.format(
2712         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2713         ext)
2714
2715
2716 def check_executable(exe, args=[]):
2717     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2718     args can be a list of arguments for a short output (like -version) """
2719     try:
2720         Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2721     except OSError:
2722         return False
2723     return exe
2724
2725
2726 def _get_exe_version_output(exe, args, *, to_screen=None):
2727     if to_screen:
2728         to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2729     try:
2730         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2731         # SIGTTOU if yt-dlp is run in the background.
2732         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2733         out, _ = Popen(
2734             [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2735             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2736     except OSError:
2737         return False
2738     if isinstance(out, bytes):  # Python 2.x
2739         out = out.decode('ascii', 'ignore')
2740     return out
2741
2742
2743 def detect_exe_version(output, version_re=None, unrecognized='present'):
2744     assert isinstance(output, compat_str)
2745     if version_re is None:
2746         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2747     m = re.search(version_re, output)
2748     if m:
2749         return m.group(1)
2750     else:
2751         return unrecognized
2752
2753
2754 def get_exe_version(exe, args=['--version'],
2755                     version_re=None, unrecognized='present'):
2756     """ Returns the version of the specified executable,
2757     or False if the executable is not present """
2758     out = _get_exe_version_output(exe, args)
2759     return detect_exe_version(out, version_re, unrecognized) if out else False
2760
2761
2762 class LazyList(collections.abc.Sequence):
2763     ''' Lazy immutable list from an iterable
2764     Note that slices of a LazyList are lists and not LazyList'''
2765
2766     class IndexError(IndexError):
2767         pass
2768
2769     def __init__(self, iterable, *, reverse=False, _cache=None):
2770         self.__iterable = iter(iterable)
2771         self.__cache = [] if _cache is None else _cache
2772         self.__reversed = reverse
2773
2774     def __iter__(self):
2775         if self.__reversed:
2776             # We need to consume the entire iterable to iterate in reverse
2777             yield from self.exhaust()
2778             return
2779         yield from self.__cache
2780         for item in self.__iterable:
2781             self.__cache.append(item)
2782             yield item
2783
2784     def __exhaust(self):
2785         self.__cache.extend(self.__iterable)
2786         # Discard the emptied iterable to make it pickle-able
2787         self.__iterable = []
2788         return self.__cache
2789
2790     def exhaust(self):
2791         ''' Evaluate the entire iterable '''
2792         return self.__exhaust()[::-1 if self.__reversed else 1]
2793
2794     @staticmethod
2795     def __reverse_index(x):
2796         return None if x is None else -(x + 1)
2797
2798     def __getitem__(self, idx):
2799         if isinstance(idx, slice):
2800             if self.__reversed:
2801                 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2802             start, stop, step = idx.start, idx.stop, idx.step or 1
2803         elif isinstance(idx, int):
2804             if self.__reversed:
2805                 idx = self.__reverse_index(idx)
2806             start, stop, step = idx, idx, 0
2807         else:
2808             raise TypeError('indices must be integers or slices')
2809         if ((start or 0) < 0 or (stop or 0) < 0
2810                 or (start is None and step < 0)
2811                 or (stop is None and step > 0)):
2812             # We need to consume the entire iterable to be able to slice from the end
2813             # Obviously, never use this with infinite iterables
2814             self.__exhaust()
2815             try:
2816                 return self.__cache[idx]
2817             except IndexError as e:
2818                 raise self.IndexError(e) from e
2819         n = max(start or 0, stop or 0) - len(self.__cache) + 1
2820         if n > 0:
2821             self.__cache.extend(itertools.islice(self.__iterable, n))
2822         try:
2823             return self.__cache[idx]
2824         except IndexError as e:
2825             raise self.IndexError(e) from e
2826
2827     def __bool__(self):
2828         try:
2829             self[-1] if self.__reversed else self[0]
2830         except self.IndexError:
2831             return False
2832         return True
2833
2834     def __len__(self):
2835         self.__exhaust()
2836         return len(self.__cache)
2837
2838     def __reversed__(self):
2839         return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2840
2841     def __copy__(self):
2842         return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2843
2844     def __repr__(self):
2845         # repr and str should mimic a list. So we exhaust the iterable
2846         return repr(self.exhaust())
2847
2848     def __str__(self):
2849         return repr(self.exhaust())
2850
2851
2852 class PagedList:
2853
2854     class IndexError(IndexError):
2855         pass
2856
2857     def __len__(self):
2858         # This is only useful for tests
2859         return len(self.getslice())
2860
2861     def __init__(self, pagefunc, pagesize, use_cache=True):
2862         self._pagefunc = pagefunc
2863         self._pagesize = pagesize
2864         self._pagecount = float('inf')
2865         self._use_cache = use_cache
2866         self._cache = {}
2867
2868     def getpage(self, pagenum):
2869         page_results = self._cache.get(pagenum)
2870         if page_results is None:
2871             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2872         if self._use_cache:
2873             self._cache[pagenum] = page_results
2874         return page_results
2875
2876     def getslice(self, start=0, end=None):
2877         return list(self._getslice(start, end))
2878
2879     def _getslice(self, start, end):
2880         raise NotImplementedError('This method must be implemented by subclasses')
2881
2882     def __getitem__(self, idx):
2883         assert self._use_cache, 'Indexing PagedList requires cache'
2884         if not isinstance(idx, int) or idx < 0:
2885             raise TypeError('indices must be non-negative integers')
2886         entries = self.getslice(idx, idx + 1)
2887         if not entries:
2888             raise self.IndexError()
2889         return entries[0]
2890
2891
2892 class OnDemandPagedList(PagedList):
2893     """Download pages until a page with less than maximum results"""
2894     def _getslice(self, start, end):
2895         for pagenum in itertools.count(start // self._pagesize):
2896             firstid = pagenum * self._pagesize
2897             nextfirstid = pagenum * self._pagesize + self._pagesize
2898             if start >= nextfirstid:
2899                 continue
2900
2901             startv = (
2902                 start % self._pagesize
2903                 if firstid <= start < nextfirstid
2904                 else 0)
2905             endv = (
2906                 ((end - 1) % self._pagesize) + 1
2907                 if (end is not None and firstid <= end <= nextfirstid)
2908                 else None)
2909
2910             try:
2911                 page_results = self.getpage(pagenum)
2912             except Exception:
2913                 self._pagecount = pagenum - 1
2914                 raise
2915             if startv != 0 or endv is not None:
2916                 page_results = page_results[startv:endv]
2917             yield from page_results
2918
2919             # A little optimization - if current page is not "full", ie. does
2920             # not contain page_size videos then we can assume that this page
2921             # is the last one - there are no more ids on further pages -
2922             # i.e. no need to query again.
2923             if len(page_results) + startv < self._pagesize:
2924                 break
2925
2926             # If we got the whole page, but the next page is not interesting,
2927             # break out early as well
2928             if end == nextfirstid:
2929                 break
2930
2931
2932 class InAdvancePagedList(PagedList):
2933     """PagedList with total number of pages known in advance"""
2934     def __init__(self, pagefunc, pagecount, pagesize):
2935         PagedList.__init__(self, pagefunc, pagesize, True)
2936         self._pagecount = pagecount
2937
2938     def _getslice(self, start, end):
2939         start_page = start // self._pagesize
2940         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2941         skip_elems = start - start_page * self._pagesize
2942         only_more = None if end is None else end - start
2943         for pagenum in range(start_page, end_page):
2944             page_results = self.getpage(pagenum)
2945             if skip_elems:
2946                 page_results = page_results[skip_elems:]
2947                 skip_elems = None
2948             if only_more is not None:
2949                 if len(page_results) < only_more:
2950                     only_more -= len(page_results)
2951                 else:
2952                     yield from page_results[:only_more]
2953                     break
2954             yield from page_results
2955
2956
2957 def uppercase_escape(s):
2958     unicode_escape = codecs.getdecoder('unicode_escape')
2959     return re.sub(
2960         r'\\U[0-9a-fA-F]{8}',
2961         lambda m: unicode_escape(m.group(0))[0],
2962         s)
2963
2964
2965 def lowercase_escape(s):
2966     unicode_escape = codecs.getdecoder('unicode_escape')
2967     return re.sub(
2968         r'\\u[0-9a-fA-F]{4}',
2969         lambda m: unicode_escape(m.group(0))[0],
2970         s)
2971
2972
2973 def escape_rfc3986(s):
2974     """Escape non-ASCII characters as suggested by RFC 3986"""
2975     if sys.version_info < (3, 0) and isinstance(s, compat_str):
2976         s = s.encode('utf-8')
2977     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2978
2979
2980 def escape_url(url):
2981     """Escape URL as suggested by RFC 3986"""
2982     url_parsed = compat_urllib_parse_urlparse(url)
2983     return url_parsed._replace(
2984         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2985         path=escape_rfc3986(url_parsed.path),
2986         params=escape_rfc3986(url_parsed.params),
2987         query=escape_rfc3986(url_parsed.query),
2988         fragment=escape_rfc3986(url_parsed.fragment)
2989     ).geturl()
2990
2991
2992 def parse_qs(url):
2993     return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2994
2995
2996 def read_batch_urls(batch_fd):
2997     def fixup(url):
2998         if not isinstance(url, compat_str):
2999             url = url.decode('utf-8', 'replace')
3000         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3001         for bom in BOM_UTF8:
3002             if url.startswith(bom):
3003                 url = url[len(bom):]
3004         url = url.lstrip()
3005         if not url or url.startswith(('#', ';', ']')):
3006             return False
3007         # "#" cannot be stripped out since it is part of the URI
3008         # However, it can be safely stipped out if follwing a whitespace
3009         return re.split(r'\s#', url, 1)[0].rstrip()
3010
3011     with contextlib.closing(batch_fd) as fd:
3012         return [url for url in map(fixup, fd) if url]
3013
3014
3015 def urlencode_postdata(*args, **kargs):
3016     return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
3017
3018
3019 def update_url_query(url, query):
3020     if not query:
3021         return url
3022     parsed_url = compat_urlparse.urlparse(url)
3023     qs = compat_parse_qs(parsed_url.query)
3024     qs.update(query)
3025     return compat_urlparse.urlunparse(parsed_url._replace(
3026         query=compat_urllib_parse_urlencode(qs, True)))
3027
3028
3029 def update_Request(req, url=None, data=None, headers={}, query={}):
3030     req_headers = req.headers.copy()
3031     req_headers.update(headers)
3032     req_data = data or req.data
3033     req_url = update_url_query(url or req.get_full_url(), query)
3034     req_get_method = req.get_method()
3035     if req_get_method == 'HEAD':
3036         req_type = HEADRequest
3037     elif req_get_method == 'PUT':
3038         req_type = PUTRequest
3039     else:
3040         req_type = compat_urllib_request.Request
3041     new_req = req_type(
3042         req_url, data=req_data, headers=req_headers,
3043         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3044     if hasattr(req, 'timeout'):
3045         new_req.timeout = req.timeout
3046     return new_req
3047
3048
3049 def _multipart_encode_impl(data, boundary):
3050     content_type = 'multipart/form-data; boundary=%s' % boundary
3051
3052     out = b''
3053     for k, v in data.items():
3054         out += b'--' + boundary.encode('ascii') + b'\r\n'
3055         if isinstance(k, compat_str):
3056             k = k.encode('utf-8')
3057         if isinstance(v, compat_str):
3058             v = v.encode('utf-8')
3059         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3060         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3061         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3062         if boundary.encode('ascii') in content:
3063             raise ValueError('Boundary overlaps with data')
3064         out += content
3065
3066     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3067
3068     return out, content_type
3069
3070
3071 def multipart_encode(data, boundary=None):
3072     '''
3073     Encode a dict to RFC 7578-compliant form-data
3074
3075     data:
3076         A dict where keys and values can be either Unicode or bytes-like
3077         objects.
3078     boundary:
3079         If specified a Unicode object, it's used as the boundary. Otherwise
3080         a random boundary is generated.
3081
3082     Reference: https://tools.ietf.org/html/rfc7578
3083     '''
3084     has_specified_boundary = boundary is not None
3085
3086     while True:
3087         if boundary is None:
3088             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3089
3090         try:
3091             out, content_type = _multipart_encode_impl(data, boundary)
3092             break
3093         except ValueError:
3094             if has_specified_boundary:
3095                 raise
3096             boundary = None
3097
3098     return out, content_type
3099
3100
3101 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3102     for val in map(d.get, variadic(key_or_keys)):
3103         if val is not None and (val or not skip_false_values):
3104             return val
3105     return default
3106
3107
3108 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3109     for f in funcs:
3110         try:
3111             val = f(*args, **kwargs)
3112         except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3113             pass
3114         else:
3115             if expected_type is None or isinstance(val, expected_type):
3116                 return val
3117
3118
3119 def try_get(src, getter, expected_type=None):
3120     return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3121
3122
3123 def filter_dict(dct, cndn=lambda _, v: v is not None):
3124     return {k: v for k, v in dct.items() if cndn(k, v)}
3125
3126
3127 def merge_dicts(*dicts):
3128     merged = {}
3129     for a_dict in dicts:
3130         for k, v in a_dict.items():
3131             if (v is not None and k not in merged
3132                     or isinstance(v, str) and merged[k] == ''):
3133                 merged[k] = v
3134     return merged
3135
3136
3137 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3138     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3139
3140
3141 US_RATINGS = {
3142     'G': 0,
3143     'PG': 10,
3144     'PG-13': 13,
3145     'R': 16,
3146     'NC': 18,
3147 }
3148
3149
3150 TV_PARENTAL_GUIDELINES = {
3151     'TV-Y': 0,
3152     'TV-Y7': 7,
3153     'TV-G': 0,
3154     'TV-PG': 0,
3155     'TV-14': 14,
3156     'TV-MA': 17,
3157 }
3158
3159
3160 def parse_age_limit(s):
3161     if type(s) == int:
3162         return s if 0 <= s <= 21 else None
3163     if not isinstance(s, compat_basestring):
3164         return None
3165     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3166     if m:
3167         return int(m.group('age'))
3168     s = s.upper()
3169     if s in US_RATINGS:
3170         return US_RATINGS[s]
3171     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3172     if m:
3173         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3174     return None
3175
3176
3177 def strip_jsonp(code):
3178     return re.sub(
3179         r'''(?sx)^
3180             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3181             (?:\s*&&\s*(?P=func_name))?
3182             \s*\(\s*(?P<callback_data>.*)\);?
3183             \s*?(?://[^\n]*)*$''',
3184         r'\g<callback_data>', code)
3185
3186
3187 def js_to_json(code, vars={}):
3188     # vars is a dict of var, val pairs to substitute
3189     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3190     SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3191     INTEGER_TABLE = (
3192         (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3193         (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3194     )
3195
3196     def fix_kv(m):
3197         v = m.group(0)
3198         if v in ('true', 'false', 'null'):
3199             return v
3200         elif v in ('undefined', 'void 0'):
3201             return 'null'
3202         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3203             return ""
3204
3205         if v[0] in ("'", '"'):
3206             v = re.sub(r'(?s)\\.|"', lambda m: {
3207                 '"': '\\"',
3208                 "\\'": "'",
3209                 '\\\n': '',
3210                 '\\x': '\\u00',
3211             }.get(m.group(0), m.group(0)), v[1:-1])
3212         else:
3213             for regex, base in INTEGER_TABLE:
3214                 im = re.match(regex, v)
3215                 if im:
3216                     i = int(im.group(1), base)
3217                     return '"%d":' % i if v.endswith(':') else '%d' % i
3218
3219             if v in vars:
3220                 return vars[v]
3221
3222         return '"%s"' % v
3223
3224     code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3225
3226     return re.sub(r'''(?sx)
3227         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3228         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3229         {comment}|,(?={skip}[\]}}])|
3230         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3231         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3232         [0-9]+(?={skip}:)|
3233         !+
3234         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3235
3236
3237 def qualities(quality_ids):
3238     """ Get a numeric quality value out of a list of possible values """
3239     def q(qid):
3240         try:
3241             return quality_ids.index(qid)
3242         except ValueError:
3243             return -1
3244     return q
3245
3246
3247 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3248
3249
3250 DEFAULT_OUTTMPL = {
3251     'default': '%(title)s [%(id)s].%(ext)s',
3252     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3253 }
3254 OUTTMPL_TYPES = {
3255     'chapter': None,
3256     'subtitle': None,
3257     'thumbnail': None,
3258     'description': 'description',
3259     'annotation': 'annotations.xml',
3260     'infojson': 'info.json',
3261     'link': None,
3262     'pl_video': None,
3263     'pl_thumbnail': None,
3264     'pl_description': 'description',
3265     'pl_infojson': 'info.json',
3266 }
3267
3268 # As of [1] format syntax is:
3269 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3270 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3271 STR_FORMAT_RE_TMPL = r'''(?x)
3272     (?<!%)(?P<prefix>(?:%%)*)
3273     %
3274     (?P<has_key>\((?P<key>{0})\))?
3275     (?P<format>
3276         (?P<conversion>[#0\-+ ]+)?
3277         (?P<min_width>\d+)?
3278         (?P<precision>\.\d+)?
3279         (?P<len_mod>[hlL])?  # unused in python
3280         {1}  # conversion type
3281     )
3282 '''
3283
3284
3285 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3286
3287
3288 def limit_length(s, length):
3289     """ Add ellipses to overly long strings """
3290     if s is None:
3291         return None
3292     ELLIPSES = '...'
3293     if len(s) > length:
3294         return s[:length - len(ELLIPSES)] + ELLIPSES
3295     return s
3296
3297
3298 def version_tuple(v):
3299     return tuple(int(e) for e in re.split(r'[-.]', v))
3300
3301
3302 def is_outdated_version(version, limit, assume_new=True):
3303     if not version:
3304         return not assume_new
3305     try:
3306         return version_tuple(version) < version_tuple(limit)
3307     except ValueError:
3308         return not assume_new
3309
3310
3311 def ytdl_is_updateable():
3312     """ Returns if yt-dlp can be updated with -U """
3313
3314     from .update import is_non_updateable
3315
3316     return not is_non_updateable()
3317
3318
3319 def args_to_str(args):
3320     # Get a short string representation for a subprocess command
3321     return ' '.join(compat_shlex_quote(a) for a in args)
3322
3323
3324 def error_to_compat_str(err):
3325     err_str = str(err)
3326     # On python 2 error byte string must be decoded with proper
3327     # encoding rather than ascii
3328     if sys.version_info[0] < 3:
3329         err_str = err_str.decode(preferredencoding())
3330     return err_str
3331
3332
3333 def error_to_str(err):
3334     return f'{type(err).__name__}: {err}'
3335
3336
3337 def mimetype2ext(mt):
3338     if mt is None:
3339         return None
3340
3341     mt, _, params = mt.partition(';')
3342     mt = mt.strip()
3343
3344     FULL_MAP = {
3345         'audio/mp4': 'm4a',
3346         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3347         # it's the most popular one
3348         'audio/mpeg': 'mp3',
3349         'audio/x-wav': 'wav',
3350         'audio/wav': 'wav',
3351         'audio/wave': 'wav',
3352     }
3353
3354     ext = FULL_MAP.get(mt)
3355     if ext is not None:
3356         return ext
3357
3358     SUBTYPE_MAP = {
3359         '3gpp': '3gp',
3360         'smptett+xml': 'tt',
3361         'ttaf+xml': 'dfxp',
3362         'ttml+xml': 'ttml',
3363         'x-flv': 'flv',
3364         'x-mp4-fragmented': 'mp4',
3365         'x-ms-sami': 'sami',
3366         'x-ms-wmv': 'wmv',
3367         'mpegurl': 'm3u8',
3368         'x-mpegurl': 'm3u8',
3369         'vnd.apple.mpegurl': 'm3u8',
3370         'dash+xml': 'mpd',
3371         'f4m+xml': 'f4m',
3372         'hds+xml': 'f4m',
3373         'vnd.ms-sstr+xml': 'ism',
3374         'quicktime': 'mov',
3375         'mp2t': 'ts',
3376         'x-wav': 'wav',
3377         'filmstrip+json': 'fs',
3378         'svg+xml': 'svg',
3379     }
3380
3381     _, _, subtype = mt.rpartition('/')
3382     ext = SUBTYPE_MAP.get(subtype.lower())
3383     if ext is not None:
3384         return ext
3385
3386     SUFFIX_MAP = {
3387         'json': 'json',
3388         'xml': 'xml',
3389         'zip': 'zip',
3390         'gzip': 'gz',
3391     }
3392
3393     _, _, suffix = subtype.partition('+')
3394     ext = SUFFIX_MAP.get(suffix)
3395     if ext is not None:
3396         return ext
3397
3398     return subtype.replace('+', '.')
3399
3400
3401 def ext2mimetype(ext_or_url):
3402     if not ext_or_url:
3403         return None
3404     if '.' not in ext_or_url:
3405         ext_or_url = f'file.{ext_or_url}'
3406     return mimetypes.guess_type(ext_or_url)[0]
3407
3408
3409 def parse_codecs(codecs_str):
3410     # http://tools.ietf.org/html/rfc6381
3411     if not codecs_str:
3412         return {}
3413     split_codecs = list(filter(None, map(
3414         str.strip, codecs_str.strip().strip(',').split(','))))
3415     vcodec, acodec, tcodec, hdr = None, None, None, None
3416     for full_codec in split_codecs:
3417         parts = full_codec.split('.')
3418         codec = parts[0].replace('0', '')
3419         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3420                      'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3421             if not vcodec:
3422                 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3423                 if codec in ('dvh1', 'dvhe'):
3424                     hdr = 'DV'
3425                 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3426                     hdr = 'HDR10'
3427                 elif full_codec.replace('0', '').startswith('vp9.2'):
3428                     hdr = 'HDR10'
3429         elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3430             if not acodec:
3431                 acodec = full_codec
3432         elif codec in ('stpp', 'wvtt',):
3433             if not tcodec:
3434                 tcodec = full_codec
3435         else:
3436             write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3437     if vcodec or acodec or tcodec:
3438         return {
3439             'vcodec': vcodec or 'none',
3440             'acodec': acodec or 'none',
3441             'dynamic_range': hdr,
3442             **({'tcodec': tcodec} if tcodec is not None else {}),
3443         }
3444     elif len(split_codecs) == 2:
3445         return {
3446             'vcodec': split_codecs[0],
3447             'acodec': split_codecs[1],
3448         }
3449     return {}
3450
3451
3452 def urlhandle_detect_ext(url_handle):
3453     getheader = url_handle.headers.get
3454
3455     cd = getheader('Content-Disposition')
3456     if cd:
3457         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3458         if m:
3459             e = determine_ext(m.group('filename'), default_ext=None)
3460             if e:
3461                 return e
3462
3463     return mimetype2ext(getheader('Content-Type'))
3464
3465
3466 def encode_data_uri(data, mime_type):
3467     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3468
3469
3470 def age_restricted(content_limit, age_limit):
3471     """ Returns True iff the content should be blocked """
3472
3473     if age_limit is None:  # No limit set
3474         return False
3475     if content_limit is None:
3476         return False  # Content available for everyone
3477     return age_limit < content_limit
3478
3479
3480 def is_html(first_bytes):
3481     """ Detect whether a file contains HTML by examining its first bytes. """
3482
3483     BOMS = [
3484         (b'\xef\xbb\xbf', 'utf-8'),
3485         (b'\x00\x00\xfe\xff', 'utf-32-be'),
3486         (b'\xff\xfe\x00\x00', 'utf-32-le'),
3487         (b'\xff\xfe', 'utf-16-le'),
3488         (b'\xfe\xff', 'utf-16-be'),
3489     ]
3490     for bom, enc in BOMS:
3491         if first_bytes.startswith(bom):
3492             s = first_bytes[len(bom):].decode(enc, 'replace')
3493             break
3494     else:
3495         s = first_bytes.decode('utf-8', 'replace')
3496
3497     return re.match(r'^\s*<', s)
3498
3499
3500 def determine_protocol(info_dict):
3501     protocol = info_dict.get('protocol')
3502     if protocol is not None:
3503         return protocol
3504
3505     url = sanitize_url(info_dict['url'])
3506     if url.startswith('rtmp'):
3507         return 'rtmp'
3508     elif url.startswith('mms'):
3509         return 'mms'
3510     elif url.startswith('rtsp'):
3511         return 'rtsp'
3512
3513     ext = determine_ext(url)
3514     if ext == 'm3u8':
3515         return 'm3u8'
3516     elif ext == 'f4m':
3517         return 'f4m'
3518
3519     return compat_urllib_parse_urlparse(url).scheme
3520
3521
3522 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3523     """ Render a list of rows, each as a list of values.
3524     Text after a \t will be right aligned """
3525     def width(string):
3526         return len(remove_terminal_sequences(string).replace('\t', ''))
3527
3528     def get_max_lens(table):
3529         return [max(width(str(v)) for v in col) for col in zip(*table)]
3530
3531     def filter_using_list(row, filterArray):
3532         return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3533
3534     max_lens = get_max_lens(data) if hide_empty else []
3535     header_row = filter_using_list(header_row, max_lens)
3536     data = [filter_using_list(row, max_lens) for row in data]
3537
3538     table = [header_row] + data
3539     max_lens = get_max_lens(table)
3540     extra_gap += 1
3541     if delim:
3542         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3543         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3544     for row in table:
3545         for pos, text in enumerate(map(str, row)):
3546             if '\t' in text:
3547                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3548             else:
3549                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3550     ret = '\n'.join(''.join(row).rstrip() for row in table)
3551     return ret
3552
3553
3554 def _match_one(filter_part, dct, incomplete):
3555     # TODO: Generalize code with YoutubeDL._build_format_filter
3556     STRING_OPERATORS = {
3557         '*=': operator.contains,
3558         '^=': lambda attr, value: attr.startswith(value),
3559         '$=': lambda attr, value: attr.endswith(value),
3560         '~=': lambda attr, value: re.search(value, attr),
3561     }
3562     COMPARISON_OPERATORS = {
3563         **STRING_OPERATORS,
3564         '<=': operator.le,  # "<=" must be defined above "<"
3565         '<': operator.lt,
3566         '>=': operator.ge,
3567         '>': operator.gt,
3568         '=': operator.eq,
3569     }
3570
3571     if isinstance(incomplete, bool):
3572         is_incomplete = lambda _: incomplete
3573     else:
3574         is_incomplete = lambda k: k in incomplete
3575
3576     operator_rex = re.compile(r'''(?x)\s*
3577         (?P<key>[a-z_]+)
3578         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3579         (?:
3580             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3581             (?P<strval>.+?)
3582         )
3583         \s*$
3584         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3585     m = operator_rex.search(filter_part)
3586     if m:
3587         m = m.groupdict()
3588         unnegated_op = COMPARISON_OPERATORS[m['op']]
3589         if m['negation']:
3590             op = lambda attr, value: not unnegated_op(attr, value)
3591         else:
3592             op = unnegated_op
3593         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3594         if m['quote']:
3595             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3596         actual_value = dct.get(m['key'])
3597         numeric_comparison = None
3598         if isinstance(actual_value, compat_numeric_types):
3599             # If the original field is a string and matching comparisonvalue is
3600             # a number we should respect the origin of the original field
3601             # and process comparison value as a string (see
3602             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3603             try:
3604                 numeric_comparison = int(comparison_value)
3605             except ValueError:
3606                 numeric_comparison = parse_filesize(comparison_value)
3607                 if numeric_comparison is None:
3608                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3609                 if numeric_comparison is None:
3610                     numeric_comparison = parse_duration(comparison_value)
3611         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3612             raise ValueError('Operator %s only supports string values!' % m['op'])
3613         if actual_value is None:
3614             return is_incomplete(m['key']) or m['none_inclusive']
3615         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3616
3617     UNARY_OPERATORS = {
3618         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3619         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3620     }
3621     operator_rex = re.compile(r'''(?x)\s*
3622         (?P<op>%s)\s*(?P<key>[a-z_]+)
3623         \s*$
3624         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3625     m = operator_rex.search(filter_part)
3626     if m:
3627         op = UNARY_OPERATORS[m.group('op')]
3628         actual_value = dct.get(m.group('key'))
3629         if is_incomplete(m.group('key')) and actual_value is None:
3630             return True
3631         return op(actual_value)
3632
3633     raise ValueError('Invalid filter part %r' % filter_part)
3634
3635
3636 def match_str(filter_str, dct, incomplete=False):
3637     """ Filter a dictionary with a simple string syntax.
3638     @returns           Whether the filter passes
3639     @param incomplete  Set of keys that is expected to be missing from dct.
3640                        Can be True/False to indicate all/none of the keys may be missing.
3641                        All conditions on incomplete keys pass if the key is missing
3642     """
3643     return all(
3644         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3645         for filter_part in re.split(r'(?<!\\)&', filter_str))
3646
3647
3648 def match_filter_func(filters):
3649     if not filters:
3650         return None
3651     filters = variadic(filters)
3652
3653     def _match_func(info_dict, *args, **kwargs):
3654         if any(match_str(f, info_dict, *args, **kwargs) for f in filters):
3655             return None
3656         else:
3657             video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3658             filter_str = ') | ('.join(map(str.strip, filters))
3659             return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3660     return _match_func
3661
3662
3663 def parse_dfxp_time_expr(time_expr):
3664     if not time_expr:
3665         return
3666
3667     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3668     if mobj:
3669         return float(mobj.group('time_offset'))
3670
3671     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3672     if mobj:
3673         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3674
3675
3676 def srt_subtitles_timecode(seconds):
3677     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3678
3679
3680 def ass_subtitles_timecode(seconds):
3681     time = timetuple_from_msec(seconds * 1000)
3682     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3683
3684
3685 def dfxp2srt(dfxp_data):
3686     '''
3687     @param dfxp_data A bytes-like object containing DFXP data
3688     @returns A unicode object containing converted SRT data
3689     '''
3690     LEGACY_NAMESPACES = (
3691         (b'http://www.w3.org/ns/ttml', [
3692             b'http://www.w3.org/2004/11/ttaf1',
3693             b'http://www.w3.org/2006/04/ttaf1',
3694             b'http://www.w3.org/2006/10/ttaf1',
3695         ]),
3696         (b'http://www.w3.org/ns/ttml#styling', [
3697             b'http://www.w3.org/ns/ttml#style',
3698         ]),
3699     )
3700
3701     SUPPORTED_STYLING = [
3702         'color',
3703         'fontFamily',
3704         'fontSize',
3705         'fontStyle',
3706         'fontWeight',
3707         'textDecoration'
3708     ]
3709
3710     _x = functools.partial(xpath_with_ns, ns_map={
3711         'xml': 'http://www.w3.org/XML/1998/namespace',
3712         'ttml': 'http://www.w3.org/ns/ttml',
3713         'tts': 'http://www.w3.org/ns/ttml#styling',
3714     })
3715
3716     styles = {}
3717     default_style = {}
3718
3719     class TTMLPElementParser(object):
3720         _out = ''
3721         _unclosed_elements = []
3722         _applied_styles = []
3723
3724         def start(self, tag, attrib):
3725             if tag in (_x('ttml:br'), 'br'):
3726                 self._out += '\n'
3727             else:
3728                 unclosed_elements = []
3729                 style = {}
3730                 element_style_id = attrib.get('style')
3731                 if default_style:
3732                     style.update(default_style)
3733                 if element_style_id:
3734                     style.update(styles.get(element_style_id, {}))
3735                 for prop in SUPPORTED_STYLING:
3736                     prop_val = attrib.get(_x('tts:' + prop))
3737                     if prop_val:
3738                         style[prop] = prop_val
3739                 if style:
3740                     font = ''
3741                     for k, v in sorted(style.items()):
3742                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3743                             continue
3744                         if k == 'color':
3745                             font += ' color="%s"' % v
3746                         elif k == 'fontSize':
3747                             font += ' size="%s"' % v
3748                         elif k == 'fontFamily':
3749                             font += ' face="%s"' % v
3750                         elif k == 'fontWeight' and v == 'bold':
3751                             self._out += '<b>'
3752                             unclosed_elements.append('b')
3753                         elif k == 'fontStyle' and v == 'italic':
3754                             self._out += '<i>'
3755                             unclosed_elements.append('i')
3756                         elif k == 'textDecoration' and v == 'underline':
3757                             self._out += '<u>'
3758                             unclosed_elements.append('u')
3759                     if font:
3760                         self._out += '<font' + font + '>'
3761                         unclosed_elements.append('font')
3762                     applied_style = {}
3763                     if self._applied_styles:
3764                         applied_style.update(self._applied_styles[-1])
3765                     applied_style.update(style)
3766                     self._applied_styles.append(applied_style)
3767                 self._unclosed_elements.append(unclosed_elements)
3768
3769         def end(self, tag):
3770             if tag not in (_x('ttml:br'), 'br'):
3771                 unclosed_elements = self._unclosed_elements.pop()
3772                 for element in reversed(unclosed_elements):
3773                     self._out += '</%s>' % element
3774                 if unclosed_elements and self._applied_styles:
3775                     self._applied_styles.pop()
3776
3777         def data(self, data):
3778             self._out += data
3779
3780         def close(self):
3781             return self._out.strip()
3782
3783     def parse_node(node):
3784         target = TTMLPElementParser()
3785         parser = xml.etree.ElementTree.XMLParser(target=target)
3786         parser.feed(xml.etree.ElementTree.tostring(node))
3787         return parser.close()
3788
3789     for k, v in LEGACY_NAMESPACES:
3790         for ns in v:
3791             dfxp_data = dfxp_data.replace(ns, k)
3792
3793     dfxp = compat_etree_fromstring(dfxp_data)
3794     out = []
3795     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3796
3797     if not paras:
3798         raise ValueError('Invalid dfxp/TTML subtitle')
3799
3800     repeat = False
3801     while True:
3802         for style in dfxp.findall(_x('.//ttml:style')):
3803             style_id = style.get('id') or style.get(_x('xml:id'))
3804             if not style_id:
3805                 continue
3806             parent_style_id = style.get('style')
3807             if parent_style_id:
3808                 if parent_style_id not in styles:
3809                     repeat = True
3810                     continue
3811                 styles[style_id] = styles[parent_style_id].copy()
3812             for prop in SUPPORTED_STYLING:
3813                 prop_val = style.get(_x('tts:' + prop))
3814                 if prop_val:
3815                     styles.setdefault(style_id, {})[prop] = prop_val
3816         if repeat:
3817             repeat = False
3818         else:
3819             break
3820
3821     for p in ('body', 'div'):
3822         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3823         if ele is None:
3824             continue
3825         style = styles.get(ele.get('style'))
3826         if not style:
3827             continue
3828         default_style.update(style)
3829
3830     for para, index in zip(paras, itertools.count(1)):
3831         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3832         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3833         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3834         if begin_time is None:
3835             continue
3836         if not end_time:
3837             if not dur:
3838                 continue
3839             end_time = begin_time + dur
3840         out.append('%d\n%s --> %s\n%s\n\n' % (
3841             index,
3842             srt_subtitles_timecode(begin_time),
3843             srt_subtitles_timecode(end_time),
3844             parse_node(para)))
3845
3846     return ''.join(out)
3847
3848
3849 def cli_option(params, command_option, param):
3850     param = params.get(param)
3851     if param:
3852         param = compat_str(param)
3853     return [command_option, param] if param is not None else []
3854
3855
3856 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3857     param = params.get(param)
3858     if param is None:
3859         return []
3860     assert isinstance(param, bool)
3861     if separator:
3862         return [command_option + separator + (true_value if param else false_value)]
3863     return [command_option, true_value if param else false_value]
3864
3865
3866 def cli_valueless_option(params, command_option, param, expected_value=True):
3867     param = params.get(param)
3868     return [command_option] if param == expected_value else []
3869
3870
3871 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3872     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3873         if use_compat:
3874             return argdict
3875         else:
3876             argdict = None
3877     if argdict is None:
3878         return default
3879     assert isinstance(argdict, dict)
3880
3881     assert isinstance(keys, (list, tuple))
3882     for key_list in keys:
3883         arg_list = list(filter(
3884             lambda x: x is not None,
3885             [argdict.get(key.lower()) for key in variadic(key_list)]))
3886         if arg_list:
3887             return [arg for args in arg_list for arg in args]
3888     return default
3889
3890
3891 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3892     main_key, exe = main_key.lower(), exe.lower()
3893     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3894     keys = [f'{root_key}{k}' for k in (keys or [''])]
3895     if root_key in keys:
3896         if main_key != exe:
3897             keys.append((main_key, exe))
3898         keys.append('default')
3899     else:
3900         use_compat = False
3901     return cli_configuration_args(argdict, keys, default, use_compat)
3902
3903
3904 class ISO639Utils(object):
3905     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3906     _lang_map = {
3907         'aa': 'aar',
3908         'ab': 'abk',
3909         'ae': 'ave',
3910         'af': 'afr',
3911         'ak': 'aka',
3912         'am': 'amh',
3913         'an': 'arg',
3914         'ar': 'ara',
3915         'as': 'asm',
3916         'av': 'ava',
3917         'ay': 'aym',
3918         'az': 'aze',
3919         'ba': 'bak',
3920         'be': 'bel',
3921         'bg': 'bul',
3922         'bh': 'bih',
3923         'bi': 'bis',
3924         'bm': 'bam',
3925         'bn': 'ben',
3926         'bo': 'bod',
3927         'br': 'bre',
3928         'bs': 'bos',
3929         'ca': 'cat',
3930         'ce': 'che',
3931         'ch': 'cha',
3932         'co': 'cos',
3933         'cr': 'cre',
3934         'cs': 'ces',
3935         'cu': 'chu',
3936         'cv': 'chv',
3937         'cy': 'cym',
3938         'da': 'dan',
3939         'de': 'deu',
3940         'dv': 'div',
3941         'dz': 'dzo',
3942         'ee': 'ewe',
3943         'el': 'ell',
3944         'en': 'eng',
3945         'eo': 'epo',
3946         'es': 'spa',
3947         'et': 'est',
3948         'eu': 'eus',
3949         'fa': 'fas',
3950         'ff': 'ful',
3951         'fi': 'fin',
3952         'fj': 'fij',
3953         'fo': 'fao',
3954         'fr': 'fra',
3955         'fy': 'fry',
3956         'ga': 'gle',
3957         'gd': 'gla',
3958         'gl': 'glg',
3959         'gn': 'grn',
3960         'gu': 'guj',
3961         'gv': 'glv',
3962         'ha': 'hau',
3963         'he': 'heb',
3964         'iw': 'heb',  # Replaced by he in 1989 revision
3965         'hi': 'hin',
3966         'ho': 'hmo',
3967         'hr': 'hrv',
3968         'ht': 'hat',
3969         'hu': 'hun',
3970         'hy': 'hye',
3971         'hz': 'her',
3972         'ia': 'ina',
3973         'id': 'ind',
3974         'in': 'ind',  # Replaced by id in 1989 revision
3975         'ie': 'ile',
3976         'ig': 'ibo',
3977         'ii': 'iii',
3978         'ik': 'ipk',
3979         'io': 'ido',
3980         'is': 'isl',
3981         'it': 'ita',
3982         'iu': 'iku',
3983         'ja': 'jpn',
3984         'jv': 'jav',
3985         'ka': 'kat',
3986         'kg': 'kon',
3987         'ki': 'kik',
3988         'kj': 'kua',
3989         'kk': 'kaz',
3990         'kl': 'kal',
3991         'km': 'khm',
3992         'kn': 'kan',
3993         'ko': 'kor',
3994         'kr': 'kau',
3995         'ks': 'kas',
3996         'ku': 'kur',
3997         'kv': 'kom',
3998         'kw': 'cor',
3999         'ky': 'kir',
4000         'la': 'lat',
4001         'lb': 'ltz',
4002         'lg': 'lug',
4003         'li': 'lim',
4004         'ln': 'lin',
4005         'lo': 'lao',
4006         'lt': 'lit',
4007         'lu': 'lub',
4008         'lv': 'lav',
4009         'mg': 'mlg',
4010         'mh': 'mah',
4011         'mi': 'mri',
4012         'mk': 'mkd',
4013         'ml': 'mal',
4014         'mn': 'mon',
4015         'mr': 'mar',
4016         'ms': 'msa',
4017         'mt': 'mlt',
4018         'my': 'mya',
4019         'na': 'nau',
4020         'nb': 'nob',
4021         'nd': 'nde',
4022         'ne': 'nep',
4023         'ng': 'ndo',
4024         'nl': 'nld',
4025         'nn': 'nno',
4026         'no': 'nor',
4027         'nr': 'nbl',
4028         'nv': 'nav',
4029         'ny': 'nya',
4030         'oc': 'oci',
4031         'oj': 'oji',
4032         'om': 'orm',
4033         'or': 'ori',
4034         'os': 'oss',
4035         'pa': 'pan',
4036         'pi': 'pli',
4037         'pl': 'pol',
4038         'ps': 'pus',
4039         'pt': 'por',
4040         'qu': 'que',
4041         'rm': 'roh',
4042         'rn': 'run',
4043         'ro': 'ron',
4044         'ru': 'rus',
4045         'rw': 'kin',
4046         'sa': 'san',
4047         'sc': 'srd',
4048         'sd': 'snd',
4049         'se': 'sme',
4050         'sg': 'sag',
4051         'si': 'sin',
4052         'sk': 'slk',
4053         'sl': 'slv',
4054         'sm': 'smo',
4055         'sn': 'sna',
4056         'so': 'som',
4057         'sq': 'sqi',
4058         'sr': 'srp',
4059         'ss': 'ssw',
4060         'st': 'sot',
4061         'su': 'sun',
4062         'sv': 'swe',
4063         'sw': 'swa',
4064         'ta': 'tam',
4065         'te': 'tel',
4066         'tg': 'tgk',
4067         'th': 'tha',
4068         'ti': 'tir',
4069         'tk': 'tuk',
4070         'tl': 'tgl',
4071         'tn': 'tsn',
4072         'to': 'ton',
4073         'tr': 'tur',
4074         'ts': 'tso',
4075         'tt': 'tat',
4076         'tw': 'twi',
4077         'ty': 'tah',
4078         'ug': 'uig',
4079         'uk': 'ukr',
4080         'ur': 'urd',
4081         'uz': 'uzb',
4082         've': 'ven',
4083         'vi': 'vie',
4084         'vo': 'vol',
4085         'wa': 'wln',
4086         'wo': 'wol',
4087         'xh': 'xho',
4088         'yi': 'yid',
4089         'ji': 'yid',  # Replaced by yi in 1989 revision
4090         'yo': 'yor',
4091         'za': 'zha',
4092         'zh': 'zho',
4093         'zu': 'zul',
4094     }
4095
4096     @classmethod
4097     def short2long(cls, code):
4098         """Convert language code from ISO 639-1 to ISO 639-2/T"""
4099         return cls._lang_map.get(code[:2])
4100
4101     @classmethod
4102     def long2short(cls, code):
4103         """Convert language code from ISO 639-2/T to ISO 639-1"""
4104         for short_name, long_name in cls._lang_map.items():
4105             if long_name == code:
4106                 return short_name
4107
4108
4109 class ISO3166Utils(object):
4110     # From http://data.okfn.org/data/core/country-list
4111     _country_map = {
4112         'AF': 'Afghanistan',
4113         'AX': 'Åland Islands',
4114         'AL': 'Albania',
4115         'DZ': 'Algeria',
4116         'AS': 'American Samoa',
4117         'AD': 'Andorra',
4118         'AO': 'Angola',
4119         'AI': 'Anguilla',
4120         'AQ': 'Antarctica',
4121         'AG': 'Antigua and Barbuda',
4122         'AR': 'Argentina',
4123         'AM': 'Armenia',
4124         'AW': 'Aruba',
4125         'AU': 'Australia',
4126         'AT': 'Austria',
4127         'AZ': 'Azerbaijan',
4128         'BS': 'Bahamas',
4129         'BH': 'Bahrain',
4130         'BD': 'Bangladesh',
4131         'BB': 'Barbados',
4132         'BY': 'Belarus',
4133         'BE': 'Belgium',
4134         'BZ': 'Belize',
4135         'BJ': 'Benin',
4136         'BM': 'Bermuda',
4137         'BT': 'Bhutan',
4138         'BO': 'Bolivia, Plurinational State of',
4139         'BQ': 'Bonaire, Sint Eustatius and Saba',
4140         'BA': 'Bosnia and Herzegovina',
4141         'BW': 'Botswana',
4142         'BV': 'Bouvet Island',
4143         'BR': 'Brazil',
4144         'IO': 'British Indian Ocean Territory',
4145         'BN': 'Brunei Darussalam',
4146         'BG': 'Bulgaria',
4147         'BF': 'Burkina Faso',
4148         'BI': 'Burundi',
4149         'KH': 'Cambodia',
4150         'CM': 'Cameroon',
4151         'CA': 'Canada',
4152         'CV': 'Cape Verde',
4153         'KY': 'Cayman Islands',
4154         'CF': 'Central African Republic',
4155         'TD': 'Chad',
4156         'CL': 'Chile',
4157         'CN': 'China',
4158         'CX': 'Christmas Island',
4159         'CC': 'Cocos (Keeling) Islands',
4160         'CO': 'Colombia',
4161         'KM': 'Comoros',
4162         'CG': 'Congo',
4163         'CD': 'Congo, the Democratic Republic of the',
4164         'CK': 'Cook Islands',
4165         'CR': 'Costa Rica',
4166         'CI': 'Côte d\'Ivoire',
4167         'HR': 'Croatia',
4168         'CU': 'Cuba',
4169         'CW': 'Curaçao',
4170         'CY': 'Cyprus',
4171         'CZ': 'Czech Republic',
4172         'DK': 'Denmark',
4173         'DJ': 'Djibouti',
4174         'DM': 'Dominica',
4175         'DO': 'Dominican Republic',
4176         'EC': 'Ecuador',
4177         'EG': 'Egypt',
4178         'SV': 'El Salvador',
4179         'GQ': 'Equatorial Guinea',
4180         'ER': 'Eritrea',
4181         'EE': 'Estonia',
4182         'ET': 'Ethiopia',
4183         'FK': 'Falkland Islands (Malvinas)',
4184         'FO': 'Faroe Islands',
4185         'FJ': 'Fiji',
4186         'FI': 'Finland',
4187         'FR': 'France',
4188         'GF': 'French Guiana',
4189         'PF': 'French Polynesia',
4190         'TF': 'French Southern Territories',
4191         'GA': 'Gabon',
4192         'GM': 'Gambia',
4193         'GE': 'Georgia',
4194         'DE': 'Germany',
4195         'GH': 'Ghana',
4196         'GI': 'Gibraltar',
4197         'GR': 'Greece',
4198         'GL': 'Greenland',
4199         'GD': 'Grenada',
4200         'GP': 'Guadeloupe',
4201         'GU': 'Guam',
4202         'GT': 'Guatemala',
4203         'GG': 'Guernsey',
4204         'GN': 'Guinea',
4205         'GW': 'Guinea-Bissau',
4206         'GY': 'Guyana',
4207         'HT': 'Haiti',
4208         'HM': 'Heard Island and McDonald Islands',
4209         'VA': 'Holy See (Vatican City State)',
4210         'HN': 'Honduras',
4211         'HK': 'Hong Kong',
4212         'HU': 'Hungary',
4213         'IS': 'Iceland',
4214         'IN': 'India',
4215         'ID': 'Indonesia',
4216         'IR': 'Iran, Islamic Republic of',
4217         'IQ': 'Iraq',
4218         'IE': 'Ireland',
4219         'IM': 'Isle of Man',
4220         'IL': 'Israel',
4221         'IT': 'Italy',
4222         'JM': 'Jamaica',
4223         'JP': 'Japan',
4224         'JE': 'Jersey',
4225         'JO': 'Jordan',
4226         'KZ': 'Kazakhstan',
4227         'KE': 'Kenya',
4228         'KI': 'Kiribati',
4229         'KP': 'Korea, Democratic People\'s Republic of',
4230         'KR': 'Korea, Republic of',
4231         'KW': 'Kuwait',
4232         'KG': 'Kyrgyzstan',
4233         'LA': 'Lao People\'s Democratic Republic',
4234         'LV': 'Latvia',
4235         'LB': 'Lebanon',
4236         'LS': 'Lesotho',
4237         'LR': 'Liberia',
4238         'LY': 'Libya',
4239         'LI': 'Liechtenstein',
4240         'LT': 'Lithuania',
4241         'LU': 'Luxembourg',
4242         'MO': 'Macao',
4243         'MK': 'Macedonia, the Former Yugoslav Republic of',
4244         'MG': 'Madagascar',
4245         'MW': 'Malawi',
4246         'MY': 'Malaysia',
4247         'MV': 'Maldives',
4248         'ML': 'Mali',
4249         'MT': 'Malta',
4250         'MH': 'Marshall Islands',
4251         'MQ': 'Martinique',
4252         'MR': 'Mauritania',
4253         'MU': 'Mauritius',
4254         'YT': 'Mayotte',
4255         'MX': 'Mexico',
4256         'FM': 'Micronesia, Federated States of',
4257         'MD': 'Moldova, Republic of',
4258         'MC': 'Monaco',
4259         'MN': 'Mongolia',
4260         'ME': 'Montenegro',
4261         'MS': 'Montserrat',
4262         'MA': 'Morocco',
4263         'MZ': 'Mozambique',
4264         'MM': 'Myanmar',
4265         'NA': 'Namibia',
4266         'NR': 'Nauru',
4267         'NP': 'Nepal',
4268         'NL': 'Netherlands',
4269         'NC': 'New Caledonia',
4270         'NZ': 'New Zealand',
4271         'NI': 'Nicaragua',
4272         'NE': 'Niger',
4273         'NG': 'Nigeria',
4274         'NU': 'Niue',
4275         'NF': 'Norfolk Island',
4276         'MP': 'Northern Mariana Islands',
4277         'NO': 'Norway',
4278         'OM': 'Oman',
4279         'PK': 'Pakistan',
4280         'PW': 'Palau',
4281         'PS': 'Palestine, State of',
4282         'PA': 'Panama',
4283         'PG': 'Papua New Guinea',
4284         'PY': 'Paraguay',
4285         'PE': 'Peru',
4286         'PH': 'Philippines',
4287         'PN': 'Pitcairn',
4288         'PL': 'Poland',
4289         'PT': 'Portugal',
4290         'PR': 'Puerto Rico',
4291         'QA': 'Qatar',
4292         'RE': 'Réunion',
4293         'RO': 'Romania',
4294         'RU': 'Russian Federation',
4295         'RW': 'Rwanda',
4296         'BL': 'Saint Barthélemy',
4297         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4298         'KN': 'Saint Kitts and Nevis',
4299         'LC': 'Saint Lucia',
4300         'MF': 'Saint Martin (French part)',
4301         'PM': 'Saint Pierre and Miquelon',
4302         'VC': 'Saint Vincent and the Grenadines',
4303         'WS': 'Samoa',
4304         'SM': 'San Marino',
4305         'ST': 'Sao Tome and Principe',
4306         'SA': 'Saudi Arabia',
4307         'SN': 'Senegal',
4308         'RS': 'Serbia',
4309         'SC': 'Seychelles',
4310         'SL': 'Sierra Leone',
4311         'SG': 'Singapore',
4312         'SX': 'Sint Maarten (Dutch part)',
4313         'SK': 'Slovakia',
4314         'SI': 'Slovenia',
4315         'SB': 'Solomon Islands',
4316         'SO': 'Somalia',
4317         'ZA': 'South Africa',
4318         'GS': 'South Georgia and the South Sandwich Islands',
4319         'SS': 'South Sudan',
4320         'ES': 'Spain',
4321         'LK': 'Sri Lanka',
4322         'SD': 'Sudan',
4323         'SR': 'Suriname',
4324         'SJ': 'Svalbard and Jan Mayen',
4325         'SZ': 'Swaziland',
4326         'SE': 'Sweden',
4327         'CH': 'Switzerland',
4328         'SY': 'Syrian Arab Republic',
4329         'TW': 'Taiwan, Province of China',
4330         'TJ': 'Tajikistan',
4331         'TZ': 'Tanzania, United Republic of',
4332         'TH': 'Thailand',
4333         'TL': 'Timor-Leste',
4334         'TG': 'Togo',
4335         'TK': 'Tokelau',
4336         'TO': 'Tonga',
4337         'TT': 'Trinidad and Tobago',
4338         'TN': 'Tunisia',
4339         'TR': 'Turkey',
4340         'TM': 'Turkmenistan',
4341         'TC': 'Turks and Caicos Islands',
4342         'TV': 'Tuvalu',
4343         'UG': 'Uganda',
4344         'UA': 'Ukraine',
4345         'AE': 'United Arab Emirates',
4346         'GB': 'United Kingdom',
4347         'US': 'United States',
4348         'UM': 'United States Minor Outlying Islands',
4349         'UY': 'Uruguay',
4350         'UZ': 'Uzbekistan',
4351         'VU': 'Vanuatu',
4352         'VE': 'Venezuela, Bolivarian Republic of',
4353         'VN': 'Viet Nam',
4354         'VG': 'Virgin Islands, British',
4355         'VI': 'Virgin Islands, U.S.',
4356         'WF': 'Wallis and Futuna',
4357         'EH': 'Western Sahara',
4358         'YE': 'Yemen',
4359         'ZM': 'Zambia',
4360         'ZW': 'Zimbabwe',
4361     }
4362
4363     @classmethod
4364     def short2full(cls, code):
4365         """Convert an ISO 3166-2 country code to the corresponding full name"""
4366         return cls._country_map.get(code.upper())
4367
4368
4369 class GeoUtils(object):
4370     # Major IPv4 address blocks per country
4371     _country_ip_map = {
4372         'AD': '46.172.224.0/19',
4373         'AE': '94.200.0.0/13',
4374         'AF': '149.54.0.0/17',
4375         'AG': '209.59.64.0/18',
4376         'AI': '204.14.248.0/21',
4377         'AL': '46.99.0.0/16',
4378         'AM': '46.70.0.0/15',
4379         'AO': '105.168.0.0/13',
4380         'AP': '182.50.184.0/21',
4381         'AQ': '23.154.160.0/24',
4382         'AR': '181.0.0.0/12',
4383         'AS': '202.70.112.0/20',
4384         'AT': '77.116.0.0/14',
4385         'AU': '1.128.0.0/11',
4386         'AW': '181.41.0.0/18',
4387         'AX': '185.217.4.0/22',
4388         'AZ': '5.197.0.0/16',
4389         'BA': '31.176.128.0/17',
4390         'BB': '65.48.128.0/17',
4391         'BD': '114.130.0.0/16',
4392         'BE': '57.0.0.0/8',
4393         'BF': '102.178.0.0/15',
4394         'BG': '95.42.0.0/15',
4395         'BH': '37.131.0.0/17',
4396         'BI': '154.117.192.0/18',
4397         'BJ': '137.255.0.0/16',
4398         'BL': '185.212.72.0/23',
4399         'BM': '196.12.64.0/18',
4400         'BN': '156.31.0.0/16',
4401         'BO': '161.56.0.0/16',
4402         'BQ': '161.0.80.0/20',
4403         'BR': '191.128.0.0/12',
4404         'BS': '24.51.64.0/18',
4405         'BT': '119.2.96.0/19',
4406         'BW': '168.167.0.0/16',
4407         'BY': '178.120.0.0/13',
4408         'BZ': '179.42.192.0/18',
4409         'CA': '99.224.0.0/11',
4410         'CD': '41.243.0.0/16',
4411         'CF': '197.242.176.0/21',
4412         'CG': '160.113.0.0/16',
4413         'CH': '85.0.0.0/13',
4414         'CI': '102.136.0.0/14',
4415         'CK': '202.65.32.0/19',
4416         'CL': '152.172.0.0/14',
4417         'CM': '102.244.0.0/14',
4418         'CN': '36.128.0.0/10',
4419         'CO': '181.240.0.0/12',
4420         'CR': '201.192.0.0/12',
4421         'CU': '152.206.0.0/15',
4422         'CV': '165.90.96.0/19',
4423         'CW': '190.88.128.0/17',
4424         'CY': '31.153.0.0/16',
4425         'CZ': '88.100.0.0/14',
4426         'DE': '53.0.0.0/8',
4427         'DJ': '197.241.0.0/17',
4428         'DK': '87.48.0.0/12',
4429         'DM': '192.243.48.0/20',
4430         'DO': '152.166.0.0/15',
4431         'DZ': '41.96.0.0/12',
4432         'EC': '186.68.0.0/15',
4433         'EE': '90.190.0.0/15',
4434         'EG': '156.160.0.0/11',
4435         'ER': '196.200.96.0/20',
4436         'ES': '88.0.0.0/11',
4437         'ET': '196.188.0.0/14',
4438         'EU': '2.16.0.0/13',
4439         'FI': '91.152.0.0/13',
4440         'FJ': '144.120.0.0/16',
4441         'FK': '80.73.208.0/21',
4442         'FM': '119.252.112.0/20',
4443         'FO': '88.85.32.0/19',
4444         'FR': '90.0.0.0/9',
4445         'GA': '41.158.0.0/15',
4446         'GB': '25.0.0.0/8',
4447         'GD': '74.122.88.0/21',
4448         'GE': '31.146.0.0/16',
4449         'GF': '161.22.64.0/18',
4450         'GG': '62.68.160.0/19',
4451         'GH': '154.160.0.0/12',
4452         'GI': '95.164.0.0/16',
4453         'GL': '88.83.0.0/19',
4454         'GM': '160.182.0.0/15',
4455         'GN': '197.149.192.0/18',
4456         'GP': '104.250.0.0/19',
4457         'GQ': '105.235.224.0/20',
4458         'GR': '94.64.0.0/13',
4459         'GT': '168.234.0.0/16',
4460         'GU': '168.123.0.0/16',
4461         'GW': '197.214.80.0/20',
4462         'GY': '181.41.64.0/18',
4463         'HK': '113.252.0.0/14',
4464         'HN': '181.210.0.0/16',
4465         'HR': '93.136.0.0/13',
4466         'HT': '148.102.128.0/17',
4467         'HU': '84.0.0.0/14',
4468         'ID': '39.192.0.0/10',
4469         'IE': '87.32.0.0/12',
4470         'IL': '79.176.0.0/13',
4471         'IM': '5.62.80.0/20',
4472         'IN': '117.192.0.0/10',
4473         'IO': '203.83.48.0/21',
4474         'IQ': '37.236.0.0/14',
4475         'IR': '2.176.0.0/12',
4476         'IS': '82.221.0.0/16',
4477         'IT': '79.0.0.0/10',
4478         'JE': '87.244.64.0/18',
4479         'JM': '72.27.0.0/17',
4480         'JO': '176.29.0.0/16',
4481         'JP': '133.0.0.0/8',
4482         'KE': '105.48.0.0/12',
4483         'KG': '158.181.128.0/17',
4484         'KH': '36.37.128.0/17',
4485         'KI': '103.25.140.0/22',
4486         'KM': '197.255.224.0/20',
4487         'KN': '198.167.192.0/19',
4488         'KP': '175.45.176.0/22',
4489         'KR': '175.192.0.0/10',
4490         'KW': '37.36.0.0/14',
4491         'KY': '64.96.0.0/15',
4492         'KZ': '2.72.0.0/13',
4493         'LA': '115.84.64.0/18',
4494         'LB': '178.135.0.0/16',
4495         'LC': '24.92.144.0/20',
4496         'LI': '82.117.0.0/19',
4497         'LK': '112.134.0.0/15',
4498         'LR': '102.183.0.0/16',
4499         'LS': '129.232.0.0/17',
4500         'LT': '78.56.0.0/13',
4501         'LU': '188.42.0.0/16',
4502         'LV': '46.109.0.0/16',
4503         'LY': '41.252.0.0/14',
4504         'MA': '105.128.0.0/11',
4505         'MC': '88.209.64.0/18',
4506         'MD': '37.246.0.0/16',
4507         'ME': '178.175.0.0/17',
4508         'MF': '74.112.232.0/21',
4509         'MG': '154.126.0.0/17',
4510         'MH': '117.103.88.0/21',
4511         'MK': '77.28.0.0/15',
4512         'ML': '154.118.128.0/18',
4513         'MM': '37.111.0.0/17',
4514         'MN': '49.0.128.0/17',
4515         'MO': '60.246.0.0/16',
4516         'MP': '202.88.64.0/20',
4517         'MQ': '109.203.224.0/19',
4518         'MR': '41.188.64.0/18',
4519         'MS': '208.90.112.0/22',
4520         'MT': '46.11.0.0/16',
4521         'MU': '105.16.0.0/12',
4522         'MV': '27.114.128.0/18',
4523         'MW': '102.70.0.0/15',
4524         'MX': '187.192.0.0/11',
4525         'MY': '175.136.0.0/13',
4526         'MZ': '197.218.0.0/15',
4527         'NA': '41.182.0.0/16',
4528         'NC': '101.101.0.0/18',
4529         'NE': '197.214.0.0/18',
4530         'NF': '203.17.240.0/22',
4531         'NG': '105.112.0.0/12',
4532         'NI': '186.76.0.0/15',
4533         'NL': '145.96.0.0/11',
4534         'NO': '84.208.0.0/13',
4535         'NP': '36.252.0.0/15',
4536         'NR': '203.98.224.0/19',
4537         'NU': '49.156.48.0/22',
4538         'NZ': '49.224.0.0/14',
4539         'OM': '5.36.0.0/15',
4540         'PA': '186.72.0.0/15',
4541         'PE': '186.160.0.0/14',
4542         'PF': '123.50.64.0/18',
4543         'PG': '124.240.192.0/19',
4544         'PH': '49.144.0.0/13',
4545         'PK': '39.32.0.0/11',
4546         'PL': '83.0.0.0/11',
4547         'PM': '70.36.0.0/20',
4548         'PR': '66.50.0.0/16',
4549         'PS': '188.161.0.0/16',
4550         'PT': '85.240.0.0/13',
4551         'PW': '202.124.224.0/20',
4552         'PY': '181.120.0.0/14',
4553         'QA': '37.210.0.0/15',
4554         'RE': '102.35.0.0/16',
4555         'RO': '79.112.0.0/13',
4556         'RS': '93.86.0.0/15',
4557         'RU': '5.136.0.0/13',
4558         'RW': '41.186.0.0/16',
4559         'SA': '188.48.0.0/13',
4560         'SB': '202.1.160.0/19',
4561         'SC': '154.192.0.0/11',
4562         'SD': '102.120.0.0/13',
4563         'SE': '78.64.0.0/12',
4564         'SG': '8.128.0.0/10',
4565         'SI': '188.196.0.0/14',
4566         'SK': '78.98.0.0/15',
4567         'SL': '102.143.0.0/17',
4568         'SM': '89.186.32.0/19',
4569         'SN': '41.82.0.0/15',
4570         'SO': '154.115.192.0/18',
4571         'SR': '186.179.128.0/17',
4572         'SS': '105.235.208.0/21',
4573         'ST': '197.159.160.0/19',
4574         'SV': '168.243.0.0/16',
4575         'SX': '190.102.0.0/20',
4576         'SY': '5.0.0.0/16',
4577         'SZ': '41.84.224.0/19',
4578         'TC': '65.255.48.0/20',
4579         'TD': '154.68.128.0/19',
4580         'TG': '196.168.0.0/14',
4581         'TH': '171.96.0.0/13',
4582         'TJ': '85.9.128.0/18',
4583         'TK': '27.96.24.0/21',
4584         'TL': '180.189.160.0/20',
4585         'TM': '95.85.96.0/19',
4586         'TN': '197.0.0.0/11',
4587         'TO': '175.176.144.0/21',
4588         'TR': '78.160.0.0/11',
4589         'TT': '186.44.0.0/15',
4590         'TV': '202.2.96.0/19',
4591         'TW': '120.96.0.0/11',
4592         'TZ': '156.156.0.0/14',
4593         'UA': '37.52.0.0/14',
4594         'UG': '102.80.0.0/13',
4595         'US': '6.0.0.0/8',
4596         'UY': '167.56.0.0/13',
4597         'UZ': '84.54.64.0/18',
4598         'VA': '212.77.0.0/19',
4599         'VC': '207.191.240.0/21',
4600         'VE': '186.88.0.0/13',
4601         'VG': '66.81.192.0/20',
4602         'VI': '146.226.0.0/16',
4603         'VN': '14.160.0.0/11',
4604         'VU': '202.80.32.0/20',
4605         'WF': '117.20.32.0/21',
4606         'WS': '202.4.32.0/19',
4607         'YE': '134.35.0.0/16',
4608         'YT': '41.242.116.0/22',
4609         'ZA': '41.0.0.0/11',
4610         'ZM': '102.144.0.0/13',
4611         'ZW': '102.177.192.0/18',
4612     }
4613
4614     @classmethod
4615     def random_ipv4(cls, code_or_block):
4616         if len(code_or_block) == 2:
4617             block = cls._country_ip_map.get(code_or_block.upper())
4618             if not block:
4619                 return None
4620         else:
4621             block = code_or_block
4622         addr, preflen = block.split('/')
4623         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4624         addr_max = addr_min | (0xffffffff >> int(preflen))
4625         return compat_str(socket.inet_ntoa(
4626             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4627
4628
4629 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4630     def __init__(self, proxies=None):
4631         # Set default handlers
4632         for type in ('http', 'https'):
4633             setattr(self, '%s_open' % type,
4634                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4635                         meth(r, proxy, type))
4636         compat_urllib_request.ProxyHandler.__init__(self, proxies)
4637
4638     def proxy_open(self, req, proxy, type):
4639         req_proxy = req.headers.get('Ytdl-request-proxy')
4640         if req_proxy is not None:
4641             proxy = req_proxy
4642             del req.headers['Ytdl-request-proxy']
4643
4644         if proxy == '__noproxy__':
4645             return None  # No Proxy
4646         if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4647             req.add_header('Ytdl-socks-proxy', proxy)
4648             # yt-dlp's http/https handlers do wrapping the socket with socks
4649             return None
4650         return compat_urllib_request.ProxyHandler.proxy_open(
4651             self, req, proxy, type)
4652
4653
4654 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4655 # released into Public Domain
4656 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4657
4658 def long_to_bytes(n, blocksize=0):
4659     """long_to_bytes(n:long, blocksize:int) : string
4660     Convert a long integer to a byte string.
4661
4662     If optional blocksize is given and greater than zero, pad the front of the
4663     byte string with binary zeros so that the length is a multiple of
4664     blocksize.
4665     """
4666     # after much testing, this algorithm was deemed to be the fastest
4667     s = b''
4668     n = int(n)
4669     while n > 0:
4670         s = compat_struct_pack('>I', n & 0xffffffff) + s
4671         n = n >> 32
4672     # strip off leading zeros
4673     for i in range(len(s)):
4674         if s[i] != b'\000'[0]:
4675             break
4676     else:
4677         # only happens when n == 0
4678         s = b'\000'
4679         i = 0
4680     s = s[i:]
4681     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4682     # de-padding being done above, but sigh...
4683     if blocksize > 0 and len(s) % blocksize:
4684         s = (blocksize - len(s) % blocksize) * b'\000' + s
4685     return s
4686
4687
4688 def bytes_to_long(s):
4689     """bytes_to_long(string) : long
4690     Convert a byte string to a long integer.
4691
4692     This is (essentially) the inverse of long_to_bytes().
4693     """
4694     acc = 0
4695     length = len(s)
4696     if length % 4:
4697         extra = (4 - length % 4)
4698         s = b'\000' * extra + s
4699         length = length + extra
4700     for i in range(0, length, 4):
4701         acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4702     return acc
4703
4704
4705 def ohdave_rsa_encrypt(data, exponent, modulus):
4706     '''
4707     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4708
4709     Input:
4710         data: data to encrypt, bytes-like object
4711         exponent, modulus: parameter e and N of RSA algorithm, both integer
4712     Output: hex string of encrypted data
4713
4714     Limitation: supports one block encryption only
4715     '''
4716
4717     payload = int(binascii.hexlify(data[::-1]), 16)
4718     encrypted = pow(payload, exponent, modulus)
4719     return '%x' % encrypted
4720
4721
4722 def pkcs1pad(data, length):
4723     """
4724     Padding input data with PKCS#1 scheme
4725
4726     @param {int[]} data        input data
4727     @param {int}   length      target length
4728     @returns {int[]}           padded data
4729     """
4730     if len(data) > length - 11:
4731         raise ValueError('Input data too long for PKCS#1 padding')
4732
4733     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4734     return [0, 2] + pseudo_random + [0] + data
4735
4736
4737 def encode_base_n(num, n, table=None):
4738     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4739     if not table:
4740         table = FULL_TABLE[:n]
4741
4742     if n > len(table):
4743         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4744
4745     if num == 0:
4746         return table[0]
4747
4748     ret = ''
4749     while num:
4750         ret = table[num % n] + ret
4751         num = num // n
4752     return ret
4753
4754
4755 def decode_packed_codes(code):
4756     mobj = re.search(PACKED_CODES_RE, code)
4757     obfuscated_code, base, count, symbols = mobj.groups()
4758     base = int(base)
4759     count = int(count)
4760     symbols = symbols.split('|')
4761     symbol_table = {}
4762
4763     while count:
4764         count -= 1
4765         base_n_count = encode_base_n(count, base)
4766         symbol_table[base_n_count] = symbols[count] or base_n_count
4767
4768     return re.sub(
4769         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4770         obfuscated_code)
4771
4772
4773 def caesar(s, alphabet, shift):
4774     if shift == 0:
4775         return s
4776     l = len(alphabet)
4777     return ''.join(
4778         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4779         for c in s)
4780
4781
4782 def rot47(s):
4783     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4784
4785
4786 def parse_m3u8_attributes(attrib):
4787     info = {}
4788     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4789         if val.startswith('"'):
4790             val = val[1:-1]
4791         info[key] = val
4792     return info
4793
4794
4795 def urshift(val, n):
4796     return val >> n if val >= 0 else (val + 0x100000000) >> n
4797
4798
4799 # Based on png2str() written by @gdkchan and improved by @yokrysty
4800 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4801 def decode_png(png_data):
4802     # Reference: https://www.w3.org/TR/PNG/
4803     header = png_data[8:]
4804
4805     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4806         raise IOError('Not a valid PNG file.')
4807
4808     int_map = {1: '>B', 2: '>H', 4: '>I'}
4809     unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4810
4811     chunks = []
4812
4813     while header:
4814         length = unpack_integer(header[:4])
4815         header = header[4:]
4816
4817         chunk_type = header[:4]
4818         header = header[4:]
4819
4820         chunk_data = header[:length]
4821         header = header[length:]
4822
4823         header = header[4:]  # Skip CRC
4824
4825         chunks.append({
4826             'type': chunk_type,
4827             'length': length,
4828             'data': chunk_data
4829         })
4830
4831     ihdr = chunks[0]['data']
4832
4833     width = unpack_integer(ihdr[:4])
4834     height = unpack_integer(ihdr[4:8])
4835
4836     idat = b''
4837
4838     for chunk in chunks:
4839         if chunk['type'] == b'IDAT':
4840             idat += chunk['data']
4841
4842     if not idat:
4843         raise IOError('Unable to read PNG data.')
4844
4845     decompressed_data = bytearray(zlib.decompress(idat))
4846
4847     stride = width * 3
4848     pixels = []
4849
4850     def _get_pixel(idx):
4851         x = idx % stride
4852         y = idx // stride
4853         return pixels[y][x]
4854
4855     for y in range(height):
4856         basePos = y * (1 + stride)
4857         filter_type = decompressed_data[basePos]
4858
4859         current_row = []
4860
4861         pixels.append(current_row)
4862
4863         for x in range(stride):
4864             color = decompressed_data[1 + basePos + x]
4865             basex = y * stride + x
4866             left = 0
4867             up = 0
4868
4869             if x > 2:
4870                 left = _get_pixel(basex - 3)
4871             if y > 0:
4872                 up = _get_pixel(basex - stride)
4873
4874             if filter_type == 1:  # Sub
4875                 color = (color + left) & 0xff
4876             elif filter_type == 2:  # Up
4877                 color = (color + up) & 0xff
4878             elif filter_type == 3:  # Average
4879                 color = (color + ((left + up) >> 1)) & 0xff
4880             elif filter_type == 4:  # Paeth
4881                 a = left
4882                 b = up
4883                 c = 0
4884
4885                 if x > 2 and y > 0:
4886                     c = _get_pixel(basex - stride - 3)
4887
4888                 p = a + b - c
4889
4890                 pa = abs(p - a)
4891                 pb = abs(p - b)
4892                 pc = abs(p - c)
4893
4894                 if pa <= pb and pa <= pc:
4895                     color = (color + a) & 0xff
4896                 elif pb <= pc:
4897                     color = (color + b) & 0xff
4898                 else:
4899                     color = (color + c) & 0xff
4900
4901             current_row.append(color)
4902
4903     return width, height, pixels
4904
4905
4906 def write_xattr(path, key, value):
4907     # This mess below finds the best xattr tool for the job
4908     try:
4909         # try the pyxattr module...
4910         import xattr
4911
4912         if hasattr(xattr, 'set'):  # pyxattr
4913             # Unicode arguments are not supported in python-pyxattr until
4914             # version 0.5.0
4915             # See https://github.com/ytdl-org/youtube-dl/issues/5498
4916             pyxattr_required_version = '0.5.0'
4917             if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4918                 # TODO: fallback to CLI tools
4919                 raise XAttrUnavailableError(
4920                     'python-pyxattr is detected but is too old. '
4921                     'yt-dlp requires %s or above while your version is %s. '
4922                     'Falling back to other xattr implementations' % (
4923                         pyxattr_required_version, xattr.__version__))
4924
4925             setxattr = xattr.set
4926         else:  # xattr
4927             setxattr = xattr.setxattr
4928
4929         try:
4930             setxattr(path, key, value)
4931         except EnvironmentError as e:
4932             raise XAttrMetadataError(e.errno, e.strerror)
4933
4934     except ImportError:
4935         if compat_os_name == 'nt':
4936             # Write xattrs to NTFS Alternate Data Streams:
4937             # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4938             assert ':' not in key
4939             assert os.path.exists(path)
4940
4941             ads_fn = path + ':' + key
4942             try:
4943                 with open(ads_fn, 'wb') as f:
4944                     f.write(value)
4945             except EnvironmentError as e:
4946                 raise XAttrMetadataError(e.errno, e.strerror)
4947         else:
4948             user_has_setfattr = check_executable('setfattr', ['--version'])
4949             user_has_xattr = check_executable('xattr', ['-h'])
4950
4951             if user_has_setfattr or user_has_xattr:
4952
4953                 value = value.decode('utf-8')
4954                 if user_has_setfattr:
4955                     executable = 'setfattr'
4956                     opts = ['-n', key, '-v', value]
4957                 elif user_has_xattr:
4958                     executable = 'xattr'
4959                     opts = ['-w', key, value]
4960
4961                 cmd = ([encodeFilename(executable, True)]
4962                        + [encodeArgument(o) for o in opts]
4963                        + [encodeFilename(path, True)])
4964
4965                 try:
4966                     p = Popen(
4967                         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4968                 except EnvironmentError as e:
4969                     raise XAttrMetadataError(e.errno, e.strerror)
4970                 stdout, stderr = p.communicate_or_kill()
4971                 stderr = stderr.decode('utf-8', 'replace')
4972                 if p.returncode != 0:
4973                     raise XAttrMetadataError(p.returncode, stderr)
4974
4975             else:
4976                 # On Unix, and can't find pyxattr, setfattr, or xattr.
4977                 if sys.platform.startswith('linux'):
4978                     raise XAttrUnavailableError(
4979                         "Couldn't find a tool to set the xattrs. "
4980                         "Install either the python 'pyxattr' or 'xattr' "
4981                         "modules, or the GNU 'attr' package "
4982                         "(which contains the 'setfattr' tool).")
4983                 else:
4984                     raise XAttrUnavailableError(
4985                         "Couldn't find a tool to set the xattrs. "
4986                         "Install either the python 'xattr' module, "
4987                         "or the 'xattr' binary.")
4988
4989
4990 def random_birthday(year_field, month_field, day_field):
4991     start_date = datetime.date(1950, 1, 1)
4992     end_date = datetime.date(1995, 12, 31)
4993     offset = random.randint(0, (end_date - start_date).days)
4994     random_date = start_date + datetime.timedelta(offset)
4995     return {
4996         year_field: str(random_date.year),
4997         month_field: str(random_date.month),
4998         day_field: str(random_date.day),
4999     }
5000
5001
5002 # Templates for internet shortcut files, which are plain text files.
5003 DOT_URL_LINK_TEMPLATE = '''
5004 [InternetShortcut]
5005 URL=%(url)s
5006 '''.lstrip()
5007
5008 DOT_WEBLOC_LINK_TEMPLATE = '''
5009 <?xml version="1.0" encoding="UTF-8"?>
5010 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5011 <plist version="1.0">
5012 <dict>
5013 \t<key>URL</key>
5014 \t<string>%(url)s</string>
5015 </dict>
5016 </plist>
5017 '''.lstrip()
5018
5019 DOT_DESKTOP_LINK_TEMPLATE = '''
5020 [Desktop Entry]
5021 Encoding=UTF-8
5022 Name=%(filename)s
5023 Type=Link
5024 URL=%(url)s
5025 Icon=text-html
5026 '''.lstrip()
5027
5028 LINK_TEMPLATES = {
5029     'url': DOT_URL_LINK_TEMPLATE,
5030     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5031     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5032 }
5033
5034
5035 def iri_to_uri(iri):
5036     """
5037     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5038
5039     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5040     """
5041
5042     iri_parts = compat_urllib_parse_urlparse(iri)
5043
5044     if '[' in iri_parts.netloc:
5045         raise ValueError('IPv6 URIs are not, yet, supported.')
5046         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5047
5048     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5049
5050     net_location = ''
5051     if iri_parts.username:
5052         net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
5053         if iri_parts.password is not None:
5054             net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
5055         net_location += '@'
5056
5057     net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
5058     # The 'idna' encoding produces ASCII text.
5059     if iri_parts.port is not None and iri_parts.port != 80:
5060         net_location += ':' + str(iri_parts.port)
5061
5062     return compat_urllib_parse_urlunparse(
5063         (iri_parts.scheme,
5064             net_location,
5065
5066             compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5067
5068             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5069             compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5070
5071             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5072             compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5073
5074             compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5075
5076     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5077
5078
5079 def to_high_limit_path(path):
5080     if sys.platform in ['win32', 'cygwin']:
5081         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5082         return r'\\?\ '.rstrip() + os.path.abspath(path)
5083
5084     return path
5085
5086
5087 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
5088     val = traverse_obj(obj, *variadic(field))
5089     if val in ignore:
5090         return default
5091     return template % (func(val) if func else val)
5092
5093
5094 def clean_podcast_url(url):
5095     return re.sub(r'''(?x)
5096         (?:
5097             (?:
5098                 chtbl\.com/track|
5099                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5100                 play\.podtrac\.com
5101             )/[^/]+|
5102             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5103             flex\.acast\.com|
5104             pd(?:
5105                 cn\.co| # https://podcorn.com/analytics-prefix/
5106                 st\.fm # https://podsights.com/docs/
5107             )/e
5108         )/''', '', url)
5109
5110
5111 _HEX_TABLE = '0123456789abcdef'
5112
5113
5114 def random_uuidv4():
5115     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5116
5117
5118 def make_dir(path, to_screen=None):
5119     try:
5120         dn = os.path.dirname(path)
5121         if dn and not os.path.exists(dn):
5122             os.makedirs(dn)
5123         return True
5124     except (OSError, IOError) as err:
5125         if callable(to_screen) is not None:
5126             to_screen('unable to create directory ' + error_to_compat_str(err))
5127         return False
5128
5129
5130 def get_executable_path():
5131     from zipimport import zipimporter
5132     if hasattr(sys, 'frozen'):  # Running from PyInstaller
5133         path = os.path.dirname(sys.executable)
5134     elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
5135         path = os.path.join(os.path.dirname(__file__), '../..')
5136     else:
5137         path = os.path.join(os.path.dirname(__file__), '..')
5138     return os.path.abspath(path)
5139
5140
5141 def load_plugins(name, suffix, namespace):
5142     classes = {}
5143     try:
5144         plugins_spec = importlib.util.spec_from_file_location(
5145             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5146         plugins = importlib.util.module_from_spec(plugins_spec)
5147         sys.modules[plugins_spec.name] = plugins
5148         plugins_spec.loader.exec_module(plugins)
5149         for name in dir(plugins):
5150             if name in namespace:
5151                 continue
5152             if not name.endswith(suffix):
5153                 continue
5154             klass = getattr(plugins, name)
5155             classes[name] = namespace[name] = klass
5156     except FileNotFoundError:
5157         pass
5158     return classes
5159
5160
5161 def traverse_obj(
5162         obj, *path_list, default=None, expected_type=None, get_all=True,
5163         casesense=True, is_user_input=False, traverse_string=False):
5164     ''' Traverse nested list/dict/tuple
5165     @param path_list        A list of paths which are checked one by one.
5166                             Each path is a list of keys where each key is a string,
5167                             a function, a tuple of strings/None or "...".
5168                             When a fuction is given, it takes the key and value as arguments
5169                             and returns whether the key matches or not. When a tuple is given,
5170                             all the keys given in the tuple are traversed, and
5171                             "..." traverses all the keys in the object
5172                             "None" returns the object without traversal
5173     @param default          Default value to return
5174     @param expected_type    Only accept final value of this type (Can also be any callable)
5175     @param get_all          Return all the values obtained from a path or only the first one
5176     @param casesense        Whether to consider dictionary keys as case sensitive
5177     @param is_user_input    Whether the keys are generated from user input. If True,
5178                             strings are converted to int/slice if necessary
5179     @param traverse_string  Whether to traverse inside strings. If True, any
5180                             non-compatible object will also be converted into a string
5181     # TODO: Write tests
5182     '''
5183     if not casesense:
5184         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5185         path_list = (map(_lower, variadic(path)) for path in path_list)
5186
5187     def _traverse_obj(obj, path, _current_depth=0):
5188         nonlocal depth
5189         path = tuple(variadic(path))
5190         for i, key in enumerate(path):
5191             if None in (key, obj):
5192                 return obj
5193             if isinstance(key, (list, tuple)):
5194                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5195                 key = ...
5196             if key is ...:
5197                 obj = (obj.values() if isinstance(obj, dict)
5198                        else obj if isinstance(obj, (list, tuple, LazyList))
5199                        else str(obj) if traverse_string else [])
5200                 _current_depth += 1
5201                 depth = max(depth, _current_depth)
5202                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5203             elif callable(key):
5204                 if isinstance(obj, (list, tuple, LazyList)):
5205                     obj = enumerate(obj)
5206                 elif isinstance(obj, dict):
5207                     obj = obj.items()
5208                 else:
5209                     if not traverse_string:
5210                         return None
5211                     obj = str(obj)
5212                 _current_depth += 1
5213                 depth = max(depth, _current_depth)
5214                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5215             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5216                 obj = (obj.get(key) if casesense or (key in obj)
5217                        else next((v for k, v in obj.items() if _lower(k) == key), None))
5218             else:
5219                 if is_user_input:
5220                     key = (int_or_none(key) if ':' not in key
5221                            else slice(*map(int_or_none, key.split(':'))))
5222                     if key == slice(None):
5223                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5224                 if not isinstance(key, (int, slice)):
5225                     return None
5226                 if not isinstance(obj, (list, tuple, LazyList)):
5227                     if not traverse_string:
5228                         return None
5229                     obj = str(obj)
5230                 try:
5231                     obj = obj[key]
5232                 except IndexError:
5233                     return None
5234         return obj
5235
5236     if isinstance(expected_type, type):
5237         type_test = lambda val: val if isinstance(val, expected_type) else None
5238     elif expected_type is not None:
5239         type_test = expected_type
5240     else:
5241         type_test = lambda val: val
5242
5243     for path in path_list:
5244         depth = 0
5245         val = _traverse_obj(obj, path)
5246         if val is not None:
5247             if depth:
5248                 for _ in range(depth - 1):
5249                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5250                 val = [v for v in map(type_test, val) if v is not None]
5251                 if val:
5252                     return val if get_all else val[0]
5253             else:
5254                 val = type_test(val)
5255                 if val is not None:
5256                     return val
5257     return default
5258
5259
5260 def traverse_dict(dictn, keys, casesense=True):
5261     write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5262                  'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5263     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5264
5265
5266 def get_first(obj, keys, **kwargs):
5267     return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5268
5269
5270 def variadic(x, allowed_types=(str, bytes, dict)):
5271     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5272
5273
5274 def decode_base(value, digits):
5275     # This will convert given base-x string to scalar (long or int)
5276     table = {char: index for index, char in enumerate(digits)}
5277     result = 0
5278     base = len(digits)
5279     for chr in value:
5280         result *= base
5281         result += table[chr]
5282     return result
5283
5284
5285 def time_seconds(**kwargs):
5286     t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5287     return t.timestamp()
5288
5289
5290 # create a JSON Web Signature (jws) with HS256 algorithm
5291 # the resulting format is in JWS Compact Serialization
5292 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5293 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5294 def jwt_encode_hs256(payload_data, key, headers={}):
5295     header_data = {
5296         'alg': 'HS256',
5297         'typ': 'JWT',
5298     }
5299     if headers:
5300         header_data.update(headers)
5301     header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5302     payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5303     h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5304     signature_b64 = base64.b64encode(h.digest())
5305     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5306     return token
5307
5308
5309 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5310 def jwt_decode_hs256(jwt):
5311     header_b64, payload_b64, signature_b64 = jwt.split('.')
5312     payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5313     return payload_data
5314
5315
5316 def supports_terminal_sequences(stream):
5317     if compat_os_name == 'nt':
5318         from .compat import WINDOWS_VT_MODE  # Must be imported locally
5319         if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5320             return False
5321     elif not os.getenv('TERM'):
5322         return False
5323     try:
5324         return stream.isatty()
5325     except BaseException:
5326         return False
5327
5328
5329 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5330
5331
5332 def remove_terminal_sequences(string):
5333     return _terminal_sequences_re.sub('', string)
5334
5335
5336 def number_of_digits(number):
5337     return len('%d' % number)
5338
5339
5340 def join_nonempty(*values, delim='-', from_dict=None):
5341     if from_dict is not None:
5342         values = map(from_dict.get, values)
5343     return delim.join(map(str, filter(None, values)))
5344
5345
5346 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5347     """
5348     Find the largest format dimensions in terms of video width and, for each thumbnail:
5349     * Modify the URL: Match the width with the provided regex and replace with the former width
5350     * Update dimensions
5351
5352     This function is useful with video services that scale the provided thumbnails on demand
5353     """
5354     _keys = ('width', 'height')
5355     max_dimensions = max(
5356         [tuple(format.get(k) or 0 for k in _keys) for format in formats],
5357         default=(0, 0))
5358     if not max_dimensions[0]:
5359         return thumbnails
5360     return [
5361         merge_dicts(
5362             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5363             dict(zip(_keys, max_dimensions)), thumbnail)
5364         for thumbnail in thumbnails
5365     ]
5366
5367
5368 def parse_http_range(range):
5369     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5370     if not range:
5371         return None, None, None
5372     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5373     if not crg:
5374         return None, None, None
5375     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5376
5377
5378 class Config:
5379     own_args = None
5380     filename = None
5381     __initialized = False
5382
5383     def __init__(self, parser, label=None):
5384         self._parser, self.label = parser, label
5385         self._loaded_paths, self.configs = set(), []
5386
5387     def init(self, args=None, filename=None):
5388         assert not self.__initialized
5389         directory = ''
5390         if filename:
5391             location = os.path.realpath(filename)
5392             directory = os.path.dirname(location)
5393             if location in self._loaded_paths:
5394                 return False
5395             self._loaded_paths.add(location)
5396
5397         self.__initialized = True
5398         self.own_args, self.filename = args, filename
5399         for location in self._parser.parse_args(args)[0].config_locations or []:
5400             location = os.path.join(directory, expand_path(location))
5401             if os.path.isdir(location):
5402                 location = os.path.join(location, 'yt-dlp.conf')
5403             if not os.path.exists(location):
5404                 self._parser.error(f'config location {location} does not exist')
5405             self.append_config(self.read_file(location), location)
5406         return True
5407
5408     def __str__(self):
5409         label = join_nonempty(
5410             self.label, 'config', f'"{self.filename}"' if self.filename else '',
5411             delim=' ')
5412         return join_nonempty(
5413             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5414             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5415             delim='\n')
5416
5417     @staticmethod
5418     def read_file(filename, default=[]):
5419         try:
5420             optionf = open(filename)
5421         except IOError:
5422             return default  # silently skip if file is not present
5423         try:
5424             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5425             contents = optionf.read()
5426             if sys.version_info < (3,):
5427                 contents = contents.decode(preferredencoding())
5428             res = compat_shlex_split(contents, comments=True)
5429         finally:
5430             optionf.close()
5431         return res
5432
5433     @staticmethod
5434     def hide_login_info(opts):
5435         PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5436         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5437
5438         def _scrub_eq(o):
5439             m = eqre.match(o)
5440             if m:
5441                 return m.group('key') + '=PRIVATE'
5442             else:
5443                 return o
5444
5445         opts = list(map(_scrub_eq, opts))
5446         for idx, opt in enumerate(opts):
5447             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5448                 opts[idx + 1] = 'PRIVATE'
5449         return opts
5450
5451     def append_config(self, *args, label=None):
5452         config = type(self)(self._parser, label)
5453         config._loaded_paths = self._loaded_paths
5454         if config.init(*args):
5455             self.configs.append(config)
5456
5457     @property
5458     def all_args(self):
5459         for config in reversed(self.configs):
5460             yield from config.all_args
5461         yield from self.own_args or []
5462
5463     def parse_args(self):
5464         return self._parser.parse_args(list(self.all_args))
5465
5466
5467 class WebSocketsWrapper():
5468     """Wraps websockets module to use in non-async scopes"""
5469
5470     def __init__(self, url, headers=None, connect=True):
5471         self.loop = asyncio.events.new_event_loop()
5472         self.conn = compat_websockets.connect(
5473             url, extra_headers=headers, ping_interval=None,
5474             close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5475         if connect:
5476             self.__enter__()
5477         atexit.register(self.__exit__, None, None, None)
5478
5479     def __enter__(self):
5480         if not self.pool:
5481             self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5482         return self
5483
5484     def send(self, *args):
5485         self.run_with_loop(self.pool.send(*args), self.loop)
5486
5487     def recv(self, *args):
5488         return self.run_with_loop(self.pool.recv(*args), self.loop)
5489
5490     def __exit__(self, type, value, traceback):
5491         try:
5492             return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5493         finally:
5494             self.loop.close()
5495             self._cancel_all_tasks(self.loop)
5496
5497     # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5498     # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5499     @staticmethod
5500     def run_with_loop(main, loop):
5501         if not asyncio.coroutines.iscoroutine(main):
5502             raise ValueError(f'a coroutine was expected, got {main!r}')
5503
5504         try:
5505             return loop.run_until_complete(main)
5506         finally:
5507             loop.run_until_complete(loop.shutdown_asyncgens())
5508             if hasattr(loop, 'shutdown_default_executor'):
5509                 loop.run_until_complete(loop.shutdown_default_executor())
5510
5511     @staticmethod
5512     def _cancel_all_tasks(loop):
5513         to_cancel = asyncio.tasks.all_tasks(loop)
5514
5515         if not to_cancel:
5516             return
5517
5518         for task in to_cancel:
5519             task.cancel()
5520
5521         loop.run_until_complete(
5522             asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5523
5524         for task in to_cancel:
5525             if task.cancelled():
5526                 continue
5527             if task.exception() is not None:
5528                 loop.call_exception_handler({
5529                     'message': 'unhandled exception during asyncio.run() shutdown',
5530                     'exception': task.exception(),
5531                     'task': task,
5532                 })
5533
5534
5535 has_websockets = bool(compat_websockets)
5536
5537
5538 def merge_headers(*dicts):
5539     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5540     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5541
5542
5543 class classproperty:
5544     def __init__(self, f):
5545         self.f = f
5546
5547     def __get__(self, _, cls):
5548         return self.f(cls)