yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import collections
  11 import contextlib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import email.header
  16 import errno
  17 import functools
  18 import gzip
  19 import hashlib
  20 import hmac
  21 import importlib.util
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import operator
  28 import os
  29 import platform
  30 import random
  31 import re
  32 import socket
  33 import ssl
  34 import subprocess
  35 import sys
  36 import tempfile
  37 import time
  38 import traceback
  39 import xml.etree.ElementTree
  40 import zlib
  41 import mimetypes
  42
  43 from .compat import (
  44     compat_HTMLParseError,
  45     compat_HTMLParser,
  46     compat_HTTPError,
  47     compat_basestring,
  48     compat_chr,
  49     compat_cookiejar,
  50     compat_ctypes_WINFUNCTYPE,
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_html_entities,
  54     compat_html_entities_html5,
  55     compat_http_client,
  56     compat_integer_types,
  57     compat_numeric_types,
  58     compat_kwargs,
  59     compat_os_name,
  60     compat_parse_qs,
  61     compat_shlex_quote,
  62     compat_str,
  63     compat_struct_pack,
  64     compat_struct_unpack,
  65     compat_urllib_error,
  66     compat_urllib_parse,
  67     compat_urllib_parse_urlencode,
  68     compat_urllib_parse_urlparse,
  69     compat_urllib_parse_urlunparse,
  70     compat_urllib_parse_quote,
  71     compat_urllib_parse_quote_plus,
  72     compat_urllib_parse_unquote_plus,
  73     compat_urllib_request,
  74     compat_urlparse,
  75     compat_xpath,
  76 )
  77
  78 from .socks import (
  79     ProxyType,
  80     sockssocket,
  81 )
  82
  83
  84 def register_socks_protocols():
  85     # "Register" SOCKS protocols
  86     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  87     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  88     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  89         if scheme not in compat_urlparse.uses_netloc:
  90             compat_urlparse.uses_netloc.append(scheme)
  91
  92
  93 # This is not clearly defined otherwise
  94 compiled_regex_type = type(re.compile(''))
  95
  96
  97 def random_user_agent():
  98     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  99     _CHROME_VERSIONS = (
 100         '90.0.4430.212',
 101         '90.0.4430.24',
 102         '90.0.4430.70',
 103         '90.0.4430.72',
 104         '90.0.4430.85',
 105         '90.0.4430.93',
 106         '91.0.4472.101',
 107         '91.0.4472.106',
 108         '91.0.4472.114',
 109         '91.0.4472.124',
 110         '91.0.4472.164',
 111         '91.0.4472.19',
 112         '91.0.4472.77',
 113         '92.0.4515.107',
 114         '92.0.4515.115',
 115         '92.0.4515.131',
 116         '92.0.4515.159',
 117         '92.0.4515.43',
 118         '93.0.4556.0',
 119         '93.0.4577.15',
 120         '93.0.4577.63',
 121         '93.0.4577.82',
 122         '94.0.4606.41',
 123         '94.0.4606.54',
 124         '94.0.4606.61',
 125         '94.0.4606.71',
 126         '94.0.4606.81',
 127         '94.0.4606.85',
 128         '95.0.4638.17',
 129         '95.0.4638.50',
 130         '95.0.4638.54',
 131         '95.0.4638.69',
 132         '95.0.4638.74',
 133         '96.0.4664.18',
 134         '96.0.4664.45',
 135         '96.0.4664.55',
 136         '96.0.4664.93',
 137         '97.0.4692.20',
 138     )
 139     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 140
 141
 142 std_headers = {
 143     'User-Agent': random_user_agent(),
 144     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 145     'Accept-Encoding': 'gzip, deflate',
 146     'Accept-Language': 'en-us,en;q=0.5',
 147 }
 148
 149
 150 USER_AGENTS = {
 151     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 152 }
 153
 154
 155 NO_DEFAULT = object()
 156
 157 ENGLISH_MONTH_NAMES = [
 158     'January', 'February', 'March', 'April', 'May', 'June',
 159     'July', 'August', 'September', 'October', 'November', 'December']
 160
 161 MONTH_NAMES = {
 162     'en': ENGLISH_MONTH_NAMES,
 163     'fr': [
 164         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 165         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 166 }
 167
 168 KNOWN_EXTENSIONS = (
 169     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 170     'flv', 'f4v', 'f4a', 'f4b',
 171     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 172     'mkv', 'mka', 'mk3d',
 173     'avi', 'divx',
 174     'mov',
 175     'asf', 'wmv', 'wma',
 176     '3gp', '3g2',
 177     'mp3',
 178     'flac',
 179     'ape',
 180     'wav',
 181     'f4f', 'f4m', 'm3u8', 'smil')
 182
 183 # needed for sanitizing filenames in restricted mode
 184 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 185                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 186                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 187
 188 DATE_FORMATS = (
 189     '%d %B %Y',
 190     '%d %b %Y',
 191     '%B %d %Y',
 192     '%B %dst %Y',
 193     '%B %dnd %Y',
 194     '%B %drd %Y',
 195     '%B %dth %Y',
 196     '%b %d %Y',
 197     '%b %dst %Y',
 198     '%b %dnd %Y',
 199     '%b %drd %Y',
 200     '%b %dth %Y',
 201     '%b %dst %Y %I:%M',
 202     '%b %dnd %Y %I:%M',
 203     '%b %drd %Y %I:%M',
 204     '%b %dth %Y %I:%M',
 205     '%Y %m %d',
 206     '%Y-%m-%d',
 207     '%Y.%m.%d.',
 208     '%Y/%m/%d',
 209     '%Y/%m/%d %H:%M',
 210     '%Y/%m/%d %H:%M:%S',
 211     '%Y%m%d%H%M',
 212     '%Y%m%d%H%M%S',
 213     '%Y%m%d',
 214     '%Y-%m-%d %H:%M',
 215     '%Y-%m-%d %H:%M:%S',
 216     '%Y-%m-%d %H:%M:%S.%f',
 217     '%Y-%m-%d %H:%M:%S:%f',
 218     '%d.%m.%Y %H:%M',
 219     '%d.%m.%Y %H.%M',
 220     '%Y-%m-%dT%H:%M:%SZ',
 221     '%Y-%m-%dT%H:%M:%S.%fZ',
 222     '%Y-%m-%dT%H:%M:%S.%f0Z',
 223     '%Y-%m-%dT%H:%M:%S',
 224     '%Y-%m-%dT%H:%M:%S.%f',
 225     '%Y-%m-%dT%H:%M',
 226     '%b %d %Y at %H:%M',
 227     '%b %d %Y at %H:%M:%S',
 228     '%B %d %Y at %H:%M',
 229     '%B %d %Y at %H:%M:%S',
 230     '%H:%M %d-%b-%Y',
 231 )
 232
 233 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 234 DATE_FORMATS_DAY_FIRST.extend([
 235     '%d-%m-%Y',
 236     '%d.%m.%Y',
 237     '%d.%m.%y',
 238     '%d/%m/%Y',
 239     '%d/%m/%y',
 240     '%d/%m/%Y %H:%M:%S',
 241 ])
 242
 243 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 244 DATE_FORMATS_MONTH_FIRST.extend([
 245     '%m-%d-%Y',
 246     '%m.%d.%Y',
 247     '%m/%d/%Y',
 248     '%m/%d/%y',
 249     '%m/%d/%Y %H:%M:%S',
 250 ])
 251
 252 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 253 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 254
 255
 256 def preferredencoding():
 257     """Get preferred encoding.
 258
 259     Returns the best encoding scheme for the system, based on
 260     locale.getpreferredencoding() and some further tweaks.
 261     """
 262     try:
 263         pref = locale.getpreferredencoding()
 264         'TEST'.encode(pref)
 265     except Exception:
 266         pref = 'UTF-8'
 267
 268     return pref
 269
 270
 271 def write_json_file(obj, fn):
 272     """ Encode obj as JSON and write it to fn, atomically if possible """
 273
 274     fn = encodeFilename(fn)
 275     if sys.version_info < (3, 0) and sys.platform != 'win32':
 276         encoding = get_filesystem_encoding()
 277         # os.path.basename returns a bytes object, but NamedTemporaryFile
 278         # will fail if the filename contains non ascii characters unless we
 279         # use a unicode object
 280         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 281         # the same for os.path.dirname
 282         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 283     else:
 284         path_basename = os.path.basename
 285         path_dirname = os.path.dirname
 286
 287     args = {
 288         'suffix': '.tmp',
 289         'prefix': path_basename(fn) + '.',
 290         'dir': path_dirname(fn),
 291         'delete': False,
 292     }
 293
 294     # In Python 2.x, json.dump expects a bytestream.
 295     # In Python 3.x, it writes to a character stream
 296     if sys.version_info < (3, 0):
 297         args['mode'] = 'wb'
 298     else:
 299         args.update({
 300             'mode': 'w',
 301             'encoding': 'utf-8',
 302         })
 303
 304     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 305
 306     try:
 307         with tf:
 308             json.dump(obj, tf, ensure_ascii=False)
 309         if sys.platform == 'win32':
 310             # Need to remove existing file on Windows, else os.rename raises
 311             # WindowsError or FileExistsError.
 312             try:
 313                 os.unlink(fn)
 314             except OSError:
 315                 pass
 316         try:
 317             mask = os.umask(0)
 318             os.umask(mask)
 319             os.chmod(tf.name, 0o666 & ~mask)
 320         except OSError:
 321             pass
 322         os.rename(tf.name, fn)
 323     except Exception:
 324         try:
 325             os.remove(tf.name)
 326         except OSError:
 327             pass
 328         raise
 329
 330
 331 if sys.version_info >= (2, 7):
 332     def find_xpath_attr(node, xpath, key, val=None):
 333         """ Find the xpath xpath[@key=val] """
 334         assert re.match(r'^[a-zA-Z_-]+$', key)
 335         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 336         return node.find(expr)
 337 else:
 338     def find_xpath_attr(node, xpath, key, val=None):
 339         for f in node.findall(compat_xpath(xpath)):
 340             if key not in f.attrib:
 341                 continue
 342             if val is None or f.attrib.get(key) == val:
 343                 return f
 344         return None
 345
 346 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 347 # the namespace parameter
 348
 349
 350 def xpath_with_ns(path, ns_map):
 351     components = [c.split(':') for c in path.split('/')]
 352     replaced = []
 353     for c in components:
 354         if len(c) == 1:
 355             replaced.append(c[0])
 356         else:
 357             ns, tag = c
 358             replaced.append('{%s}%s' % (ns_map[ns], tag))
 359     return '/'.join(replaced)
 360
 361
 362 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 363     def _find_xpath(xpath):
 364         return node.find(compat_xpath(xpath))
 365
 366     if isinstance(xpath, (str, compat_str)):
 367         n = _find_xpath(xpath)
 368     else:
 369         for xp in xpath:
 370             n = _find_xpath(xp)
 371             if n is not None:
 372                 break
 373
 374     if n is None:
 375         if default is not NO_DEFAULT:
 376             return default
 377         elif fatal:
 378             name = xpath if name is None else name
 379             raise ExtractorError('Could not find XML element %s' % name)
 380         else:
 381             return None
 382     return n
 383
 384
 385 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 386     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 387     if n is None or n == default:
 388         return n
 389     if n.text is None:
 390         if default is not NO_DEFAULT:
 391             return default
 392         elif fatal:
 393             name = xpath if name is None else name
 394             raise ExtractorError('Could not find XML element\'s text %s' % name)
 395         else:
 396             return None
 397     return n.text
 398
 399
 400 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 401     n = find_xpath_attr(node, xpath, key)
 402     if n is None:
 403         if default is not NO_DEFAULT:
 404             return default
 405         elif fatal:
 406             name = '%s[@%s]' % (xpath, key) if name is None else name
 407             raise ExtractorError('Could not find XML attribute %s' % name)
 408         else:
 409             return None
 410     return n.attrib[key]
 411
 412
 413 def get_element_by_id(id, html):
 414     """Return the content of the tag with the specified ID in the passed HTML document"""
 415     return get_element_by_attribute('id', id, html)
 416
 417
 418 def get_element_by_class(class_name, html):
 419     """Return the content of the first tag with the specified class in the passed HTML document"""
 420     retval = get_elements_by_class(class_name, html)
 421     return retval[0] if retval else None
 422
 423
 424 def get_element_by_attribute(attribute, value, html, escape_value=True):
 425     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 426     return retval[0] if retval else None
 427
 428
 429 def get_elements_by_class(class_name, html):
 430     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 431     return get_elements_by_attribute(
 432         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 433         html, escape_value=False)
 434
 435
 436 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 437     """Return the content of the tag with the specified attribute in the passed HTML document"""
 438
 439     value = re.escape(value) if escape_value else value
 440
 441     retlist = []
 442     for m in re.finditer(r'''(?xs)
 443         <([a-zA-Z0-9:._-]+)
 444          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 445          \s+%s=['"]?%s['"]?
 446          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 447         \s*>
 448         (?P<content>.*?)
 449         </\1>
 450     ''' % (re.escape(attribute), value), html):
 451         res = m.group('content')
 452
 453         if res.startswith('"') or res.startswith("'"):
 454             res = res[1:-1]
 455
 456         retlist.append(unescapeHTML(res))
 457
 458     return retlist
 459
 460
 461 class HTMLAttributeParser(compat_HTMLParser):
 462     """Trivial HTML parser to gather the attributes for a single element"""
 463
 464     def __init__(self):
 465         self.attrs = {}
 466         compat_HTMLParser.__init__(self)
 467
 468     def handle_starttag(self, tag, attrs):
 469         self.attrs = dict(attrs)
 470
 471
 472 class HTMLListAttrsParser(compat_HTMLParser):
 473     """HTML parser to gather the attributes for the elements of a list"""
 474
 475     def __init__(self):
 476         compat_HTMLParser.__init__(self)
 477         self.items = []
 478         self._level = 0
 479
 480     def handle_starttag(self, tag, attrs):
 481         if tag == 'li' and self._level == 0:
 482             self.items.append(dict(attrs))
 483         self._level += 1
 484
 485     def handle_endtag(self, tag):
 486         self._level -= 1
 487
 488
 489 def extract_attributes(html_element):
 490     """Given a string for an HTML element such as
 491     <el
 492          a="foo" B="bar" c="&98;az" d=boz
 493          empty= noval entity="&amp;"
 494          sq='"' dq="'"
 495     >
 496     Decode and return a dictionary of attributes.
 497     {
 498         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 499         'empty': '', 'noval': None, 'entity': '&',
 500         'sq': '"', 'dq': '\''
 501     }.
 502     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 503     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 504     """
 505     parser = HTMLAttributeParser()
 506     try:
 507         parser.feed(html_element)
 508         parser.close()
 509     # Older Python may throw HTMLParseError in case of malformed HTML
 510     except compat_HTMLParseError:
 511         pass
 512     return parser.attrs
 513
 514
 515 def parse_list(webpage):
 516     """Given a string for an series of HTML <li> elements,
 517     return a dictionary of their attributes"""
 518     parser = HTMLListAttrsParser()
 519     parser.feed(webpage)
 520     parser.close()
 521     return parser.items
 522
 523
 524 def clean_html(html):
 525     """Clean an HTML snippet into a readable string"""
 526
 527     if html is None:  # Convenience for sanitizing descriptions etc.
 528         return html
 529
 530     # Newline vs <br />
 531     html = html.replace('\n', ' ')
 532     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 533     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 534     # Strip html tags
 535     html = re.sub('<.*?>', '', html)
 536     # Replace html entities
 537     html = unescapeHTML(html)
 538     return html.strip()
 539
 540
 541 def sanitize_open(filename, open_mode):
 542     """Try to open the given filename, and slightly tweak it if this fails.
 543
 544     Attempts to open the given filename. If this fails, it tries to change
 545     the filename slightly, step by step, until it's either able to open it
 546     or it fails and raises a final exception, like the standard open()
 547     function.
 548
 549     It returns the tuple (stream, definitive_file_name).
 550     """
 551     try:
 552         if filename == '-':
 553             if sys.platform == 'win32':
 554                 import msvcrt
 555                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 556             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 557         stream = open(encodeFilename(filename), open_mode)
 558         return (stream, filename)
 559     except (IOError, OSError) as err:
 560         if err.errno in (errno.EACCES,):
 561             raise
 562
 563         # In case of error, try to remove win32 forbidden chars
 564         alt_filename = sanitize_path(filename)
 565         if alt_filename == filename:
 566             raise
 567         else:
 568             # An exception here should be caught in the caller
 569             stream = open(encodeFilename(alt_filename), open_mode)
 570             return (stream, alt_filename)
 571
 572
 573 def timeconvert(timestr):
 574     """Convert RFC 2822 defined time string into system timestamp"""
 575     timestamp = None
 576     timetuple = email.utils.parsedate_tz(timestr)
 577     if timetuple is not None:
 578         timestamp = email.utils.mktime_tz(timetuple)
 579     return timestamp
 580
 581
 582 def sanitize_filename(s, restricted=False, is_id=False):
 583     """Sanitizes a string so it could be used as part of a filename.
 584     If restricted is set, use a stricter subset of allowed characters.
 585     Set is_id if this is not an arbitrary string, but an ID that should be kept
 586     if possible.
 587     """
 588     def replace_insane(char):
 589         if restricted and char in ACCENT_CHARS:
 590             return ACCENT_CHARS[char]
 591         elif not restricted and char == '\n':
 592             return ' '
 593         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 594             return ''
 595         elif char == '"':
 596             return '' if restricted else '\''
 597         elif char == ':':
 598             return '_-' if restricted else ' -'
 599         elif char in '\\/|*<>':
 600             return '_'
 601         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 602             return '_'
 603         if restricted and ord(char) > 127:
 604             return '_'
 605         return char
 606
 607     if s == '':
 608         return ''
 609     # Handle timestamps
 610     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 611     result = ''.join(map(replace_insane, s))
 612     if not is_id:
 613         while '__' in result:
 614             result = result.replace('__', '_')
 615         result = result.strip('_')
 616         # Common case of "Foreign band name - English song title"
 617         if restricted and result.startswith('-_'):
 618             result = result[2:]
 619         if result.startswith('-'):
 620             result = '_' + result[len('-'):]
 621         result = result.lstrip('.')
 622         if not result:
 623             result = '_'
 624     return result
 625
 626
 627 def sanitize_path(s, force=False):
 628     """Sanitizes and normalizes path on Windows"""
 629     if sys.platform == 'win32':
 630         force = False
 631         drive_or_unc, _ = os.path.splitdrive(s)
 632         if sys.version_info < (2, 7) and not drive_or_unc:
 633             drive_or_unc, _ = os.path.splitunc(s)
 634     elif force:
 635         drive_or_unc = ''
 636     else:
 637         return s
 638
 639     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 640     if drive_or_unc:
 641         norm_path.pop(0)
 642     sanitized_path = [
 643         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 644         for path_part in norm_path]
 645     if drive_or_unc:
 646         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 647     elif force and s[0] == os.path.sep:
 648         sanitized_path.insert(0, os.path.sep)
 649     return os.path.join(*sanitized_path)
 650
 651
 652 def sanitize_url(url):
 653     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 654     # the number of unwanted failures due to missing protocol
 655     if url.startswith('//'):
 656         return 'http:%s' % url
 657     # Fix some common typos seen so far
 658     COMMON_TYPOS = (
 659         # https://github.com/ytdl-org/youtube-dl/issues/15649
 660         (r'^httpss://', r'https://'),
 661         # https://bx1.be/lives/direct-tv/
 662         (r'^rmtp([es]?)://', r'rtmp\1://'),
 663     )
 664     for mistake, fixup in COMMON_TYPOS:
 665         if re.match(mistake, url):
 666             return re.sub(mistake, fixup, url)
 667     return url
 668
 669
 670 def extract_basic_auth(url):
 671     parts = compat_urlparse.urlsplit(url)
 672     if parts.username is None:
 673         return url, None
 674     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 675         parts.hostname if parts.port is None
 676         else '%s:%d' % (parts.hostname, parts.port))))
 677     auth_payload = base64.b64encode(
 678         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 679     return url, 'Basic ' + auth_payload.decode('utf-8')
 680
 681
 682 def sanitized_Request(url, *args, **kwargs):
 683     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 684     if auth_header is not None:
 685         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 686         headers['Authorization'] = auth_header
 687     return compat_urllib_request.Request(url, *args, **kwargs)
 688
 689
 690 def expand_path(s):
 691     """Expand shell variables and ~"""
 692     return os.path.expandvars(compat_expanduser(s))
 693
 694
 695 def orderedSet(iterable):
 696     """ Remove all duplicates from the input iterable """
 697     res = []
 698     for el in iterable:
 699         if el not in res:
 700             res.append(el)
 701     return res
 702
 703
 704 def _htmlentity_transform(entity_with_semicolon):
 705     """Transforms an HTML entity to a character."""
 706     entity = entity_with_semicolon[:-1]
 707
 708     # Known non-numeric HTML entity
 709     if entity in compat_html_entities.name2codepoint:
 710         return compat_chr(compat_html_entities.name2codepoint[entity])
 711
 712     # TODO: HTML5 allows entities without a semicolon. For example,
 713     # '&Eacuteric' should be decoded as 'Éric'.
 714     if entity_with_semicolon in compat_html_entities_html5:
 715         return compat_html_entities_html5[entity_with_semicolon]
 716
 717     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 718     if mobj is not None:
 719         numstr = mobj.group(1)
 720         if numstr.startswith('x'):
 721             base = 16
 722             numstr = '0%s' % numstr
 723         else:
 724             base = 10
 725         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 726         try:
 727             return compat_chr(int(numstr, base))
 728         except ValueError:
 729             pass
 730
 731     # Unknown entity in name, return its literal representation
 732     return '&%s;' % entity
 733
 734
 735 def unescapeHTML(s):
 736     if s is None:
 737         return None
 738     assert type(s) == compat_str
 739
 740     return re.sub(
 741         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 742
 743
 744 def escapeHTML(text):
 745     return (
 746         text
 747         .replace('&', '&amp;')
 748         .replace('<', '&lt;')
 749         .replace('>', '&gt;')
 750         .replace('"', '&quot;')
 751         .replace("'", '&#39;')
 752     )
 753
 754
 755 def process_communicate_or_kill(p, *args, **kwargs):
 756     try:
 757         return p.communicate(*args, **kwargs)
 758     except BaseException:  # Including KeyboardInterrupt
 759         p.kill()
 760         p.wait()
 761         raise
 762
 763
 764 class Popen(subprocess.Popen):
 765     if sys.platform == 'win32':
 766         _startupinfo = subprocess.STARTUPINFO()
 767         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 768     else:
 769         _startupinfo = None
 770
 771     def __init__(self, *args, **kwargs):
 772         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 773
 774     def communicate_or_kill(self, *args, **kwargs):
 775         return process_communicate_or_kill(self, *args, **kwargs)
 776
 777
 778 def get_subprocess_encoding():
 779     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 780         # For subprocess calls, encode with locale encoding
 781         # Refer to http://stackoverflow.com/a/9951851/35070
 782         encoding = preferredencoding()
 783     else:
 784         encoding = sys.getfilesystemencoding()
 785     if encoding is None:
 786         encoding = 'utf-8'
 787     return encoding
 788
 789
 790 def encodeFilename(s, for_subprocess=False):
 791     """
 792     @param s The name of the file
 793     """
 794
 795     assert type(s) == compat_str
 796
 797     # Python 3 has a Unicode API
 798     if sys.version_info >= (3, 0):
 799         return s
 800
 801     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 802     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 803     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 804     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 805         return s
 806
 807     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 808     if sys.platform.startswith('java'):
 809         return s
 810
 811     return s.encode(get_subprocess_encoding(), 'ignore')
 812
 813
 814 def decodeFilename(b, for_subprocess=False):
 815
 816     if sys.version_info >= (3, 0):
 817         return b
 818
 819     if not isinstance(b, bytes):
 820         return b
 821
 822     return b.decode(get_subprocess_encoding(), 'ignore')
 823
 824
 825 def encodeArgument(s):
 826     if not isinstance(s, compat_str):
 827         # Legacy code that uses byte strings
 828         # Uncomment the following line after fixing all post processors
 829         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 830         s = s.decode('ascii')
 831     return encodeFilename(s, True)
 832
 833
 834 def decodeArgument(b):
 835     return decodeFilename(b, True)
 836
 837
 838 def decodeOption(optval):
 839     if optval is None:
 840         return optval
 841     if isinstance(optval, bytes):
 842         optval = optval.decode(preferredencoding())
 843
 844     assert isinstance(optval, compat_str)
 845     return optval
 846
 847
 848 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 849
 850
 851 def timetuple_from_msec(msec):
 852     secs, msec = divmod(msec, 1000)
 853     mins, secs = divmod(secs, 60)
 854     hrs, mins = divmod(mins, 60)
 855     return _timetuple(hrs, mins, secs, msec)
 856
 857
 858 def formatSeconds(secs, delim=':', msec=False):
 859     time = timetuple_from_msec(secs * 1000)
 860     if time.hours:
 861         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 862     elif time.minutes:
 863         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 864     else:
 865         ret = '%d' % time.seconds
 866     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 867
 868
 869 def _ssl_load_windows_store_certs(ssl_context, storename):
 870     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 871     try:
 872         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 873                  if encoding == 'x509_asn' and (
 874                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 875     except PermissionError:
 876         return
 877     for cert in certs:
 878         try:
 879             ssl_context.load_verify_locations(cadata=cert)
 880         except ssl.SSLError:
 881             pass
 882
 883
 884 def make_HTTPS_handler(params, **kwargs):
 885     opts_check_certificate = not params.get('nocheckcertificate')
 886     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 887     context.check_hostname = opts_check_certificate
 888     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
 889     if opts_check_certificate:
 890         try:
 891             context.load_default_certs()
 892             # Work around the issue in load_default_certs when there are bad certificates. See:
 893             # https://github.com/yt-dlp/yt-dlp/issues/1060,
 894             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
 895         except ssl.SSLError:
 896             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
 897             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
 898                 # Create a new context to discard any certificates that were already loaded
 899                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 900                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
 901                 for storename in ('CA', 'ROOT'):
 902                     _ssl_load_windows_store_certs(context, storename)
 903             context.set_default_verify_paths()
 904     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 905
 906
 907 def bug_reports_message(before=';'):
 908     if ytdl_is_updateable():
 909         update_cmd = 'type  yt-dlp -U  to update'
 910     else:
 911         update_cmd = 'see  https://github.com/yt-dlp/yt-dlp  on how to update'
 912     msg = 'please report this issue on  https://github.com/yt-dlp/yt-dlp .'
 913     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 914     msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
 915
 916     before = before.rstrip()
 917     if not before or before.endswith(('.', '!', '?')):
 918         msg = msg[0].title() + msg[1:]
 919
 920     return (before + ' ' if before else '') + msg
 921
 922
 923 class YoutubeDLError(Exception):
 924     """Base exception for YoutubeDL errors."""
 925     msg = None
 926
 927     def __init__(self, msg=None):
 928         if msg is not None:
 929             self.msg = msg
 930         elif self.msg is None:
 931             self.msg = type(self).__name__
 932         super().__init__(self.msg)
 933
 934
 935 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 936 if hasattr(ssl, 'CertificateError'):
 937     network_exceptions.append(ssl.CertificateError)
 938 network_exceptions = tuple(network_exceptions)
 939
 940
 941 class ExtractorError(YoutubeDLError):
 942     """Error during info extraction."""
 943
 944     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
 945         """ tb, if given, is the original traceback (so that it can be printed out).
 946         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
 947         """
 948         if sys.exc_info()[0] in network_exceptions:
 949             expected = True
 950
 951         self.msg = str(msg)
 952         self.traceback = tb
 953         self.expected = expected
 954         self.cause = cause
 955         self.video_id = video_id
 956         self.ie = ie
 957         self.exc_info = sys.exc_info()  # preserve original exception
 958
 959         super(ExtractorError, self).__init__(''.join((
 960             format_field(ie, template='[%s] '),
 961             format_field(video_id, template='%s: '),
 962             self.msg,
 963             format_field(cause, template=' (caused by %r)'),
 964             '' if expected else bug_reports_message())))
 965
 966     def format_traceback(self):
 967         if self.traceback is None:
 968             return None
 969         return ''.join(traceback.format_tb(self.traceback))
 970
 971
 972 class UnsupportedError(ExtractorError):
 973     def __init__(self, url):
 974         super(UnsupportedError, self).__init__(
 975             'Unsupported URL: %s' % url, expected=True)
 976         self.url = url
 977
 978
 979 class RegexNotFoundError(ExtractorError):
 980     """Error when a regex didn't match"""
 981     pass
 982
 983
 984 class GeoRestrictedError(ExtractorError):
 985     """Geographic restriction Error exception.
 986
 987     This exception may be thrown when a video is not available from your
 988     geographic location due to geographic restrictions imposed by a website.
 989     """
 990
 991     def __init__(self, msg, countries=None, **kwargs):
 992         kwargs['expected'] = True
 993         super(GeoRestrictedError, self).__init__(msg, **kwargs)
 994         self.countries = countries
 995
 996
 997 class DownloadError(YoutubeDLError):
 998     """Download Error exception.
 999
1000     This exception may be thrown by FileDownloader objects if they are not
1001     configured to continue on errors. They will contain the appropriate
1002     error message.
1003     """
1004
1005     def __init__(self, msg, exc_info=None):
1006         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1007         super(DownloadError, self).__init__(msg)
1008         self.exc_info = exc_info
1009
1010
1011 class EntryNotInPlaylist(YoutubeDLError):
1012     """Entry not in playlist exception.
1013
1014     This exception will be thrown by YoutubeDL when a requested entry
1015     is not found in the playlist info_dict
1016     """
1017     msg = 'Entry not found in info'
1018
1019
1020 class SameFileError(YoutubeDLError):
1021     """Same File exception.
1022
1023     This exception will be thrown by FileDownloader objects if they detect
1024     multiple files would have to be downloaded to the same file on disk.
1025     """
1026     msg = 'Fixed output name but more than one file to download'
1027
1028     def __init__(self, filename=None):
1029         if filename is not None:
1030             self.msg += f': {filename}'
1031         super().__init__(self.msg)
1032
1033
1034 class PostProcessingError(YoutubeDLError):
1035     """Post Processing exception.
1036
1037     This exception may be raised by PostProcessor's .run() method to
1038     indicate an error in the postprocessing task.
1039     """
1040
1041
1042 class DownloadCancelled(YoutubeDLError):
1043     """ Exception raised when the download queue should be interrupted """
1044     msg = 'The download was cancelled'
1045
1046
1047 class ExistingVideoReached(DownloadCancelled):
1048     """ --break-on-existing triggered """
1049     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1050
1051
1052 class RejectedVideoReached(DownloadCancelled):
1053     """ --break-on-reject triggered """
1054     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1055
1056
1057 class MaxDownloadsReached(DownloadCancelled):
1058     """ --max-downloads limit has been reached. """
1059     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1060
1061
1062 class ReExtractInfo(YoutubeDLError):
1063     """ Video info needs to be re-extracted. """
1064
1065     def __init__(self, msg, expected=False):
1066         super().__init__(msg)
1067         self.expected = expected
1068
1069
1070 class ThrottledDownload(ReExtractInfo):
1071     """ Download speed below --throttled-rate. """
1072     msg = 'The download speed is below throttle limit'
1073
1074     def __init__(self):
1075         super().__init__(self.msg, expected=False)
1076
1077
1078 class UnavailableVideoError(YoutubeDLError):
1079     """Unavailable Format exception.
1080
1081     This exception will be thrown when a video is requested
1082     in a format that is not available for that video.
1083     """
1084     msg = 'Unable to download video'
1085
1086     def __init__(self, err=None):
1087         if err is not None:
1088             self.msg += f': {err}'
1089         super().__init__(self.msg)
1090
1091
1092 class ContentTooShortError(YoutubeDLError):
1093     """Content Too Short exception.
1094
1095     This exception may be raised by FileDownloader objects when a file they
1096     download is too small for what the server announced first, indicating
1097     the connection was probably interrupted.
1098     """
1099
1100     def __init__(self, downloaded, expected):
1101         super(ContentTooShortError, self).__init__(
1102             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
1103         )
1104         # Both in bytes
1105         self.downloaded = downloaded
1106         self.expected = expected
1107
1108
1109 class XAttrMetadataError(YoutubeDLError):
1110     def __init__(self, code=None, msg='Unknown error'):
1111         super(XAttrMetadataError, self).__init__(msg)
1112         self.code = code
1113         self.msg = msg
1114
1115         # Parsing code and msg
1116         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1117                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1118             self.reason = 'NO_SPACE'
1119         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1120             self.reason = 'VALUE_TOO_LONG'
1121         else:
1122             self.reason = 'NOT_SUPPORTED'
1123
1124
1125 class XAttrUnavailableError(YoutubeDLError):
1126     pass
1127
1128
1129 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1130     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
1131     # expected HTTP responses to meet HTTP/1.0 or later (see also
1132     # https://github.com/ytdl-org/youtube-dl/issues/6727)
1133     if sys.version_info < (3, 0):
1134         kwargs['strict'] = True
1135     hc = http_class(*args, **compat_kwargs(kwargs))
1136     source_address = ydl_handler._params.get('source_address')
1137
1138     if source_address is not None:
1139         # This is to workaround _create_connection() from socket where it will try all
1140         # address data from getaddrinfo() including IPv6. This filters the result from
1141         # getaddrinfo() based on the source_address value.
1142         # This is based on the cpython socket.create_connection() function.
1143         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1144         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1145             host, port = address
1146             err = None
1147             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1148             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1149             ip_addrs = [addr for addr in addrs if addr[0] == af]
1150             if addrs and not ip_addrs:
1151                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1152                 raise socket.error(
1153                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1154                     % (ip_version, source_address[0]))
1155             for res in ip_addrs:
1156                 af, socktype, proto, canonname, sa = res
1157                 sock = None
1158                 try:
1159                     sock = socket.socket(af, socktype, proto)
1160                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1161                         sock.settimeout(timeout)
1162                     sock.bind(source_address)
1163                     sock.connect(sa)
1164                     err = None  # Explicitly break reference cycle
1165                     return sock
1166                 except socket.error as _:
1167                     err = _
1168                     if sock is not None:
1169                         sock.close()
1170             if err is not None:
1171                 raise err
1172             else:
1173                 raise socket.error('getaddrinfo returns an empty list')
1174         if hasattr(hc, '_create_connection'):
1175             hc._create_connection = _create_connection
1176         sa = (source_address, 0)
1177         if hasattr(hc, 'source_address'):  # Python 2.7+
1178             hc.source_address = sa
1179         else:  # Python 2.6
1180             def _hc_connect(self, *args, **kwargs):
1181                 sock = _create_connection(
1182                     (self.host, self.port), self.timeout, sa)
1183                 if is_https:
1184                     self.sock = ssl.wrap_socket(
1185                         sock, self.key_file, self.cert_file,
1186                         ssl_version=ssl.PROTOCOL_TLSv1)
1187                 else:
1188                     self.sock = sock
1189             hc.connect = functools.partial(_hc_connect, hc)
1190
1191     return hc
1192
1193
1194 def handle_youtubedl_headers(headers):
1195     filtered_headers = headers
1196
1197     if 'Youtubedl-no-compression' in filtered_headers:
1198         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
1199         del filtered_headers['Youtubedl-no-compression']
1200
1201     return filtered_headers
1202
1203
1204 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1205     """Handler for HTTP requests and responses.
1206
1207     This class, when installed with an OpenerDirector, automatically adds
1208     the standard headers to every HTTP request and handles gzipped and
1209     deflated responses from web servers. If compression is to be avoided in
1210     a particular request, the original request in the program code only has
1211     to include the HTTP header "Youtubedl-no-compression", which will be
1212     removed before making the real request.
1213
1214     Part of this code was copied from:
1215
1216     http://techknack.net/python-urllib2-handlers/
1217
1218     Andrew Rowls, the author of that code, agreed to release it to the
1219     public domain.
1220     """
1221
1222     def __init__(self, params, *args, **kwargs):
1223         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1224         self._params = params
1225
1226     def http_open(self, req):
1227         conn_class = compat_http_client.HTTPConnection
1228
1229         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1230         if socks_proxy:
1231             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1232             del req.headers['Ytdl-socks-proxy']
1233
1234         return self.do_open(functools.partial(
1235             _create_http_connection, self, conn_class, False),
1236             req)
1237
1238     @staticmethod
1239     def deflate(data):
1240         if not data:
1241             return data
1242         try:
1243             return zlib.decompress(data, -zlib.MAX_WBITS)
1244         except zlib.error:
1245             return zlib.decompress(data)
1246
1247     def http_request(self, req):
1248         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1249         # always respected by websites, some tend to give out URLs with non percent-encoded
1250         # non-ASCII characters (see telemb.py, ard.py [#3412])
1251         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1252         # To work around aforementioned issue we will replace request's original URL with
1253         # percent-encoded one
1254         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1255         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1256         url = req.get_full_url()
1257         url_escaped = escape_url(url)
1258
1259         # Substitute URL if any change after escaping
1260         if url != url_escaped:
1261             req = update_Request(req, url=url_escaped)
1262
1263         for h, v in std_headers.items():
1264             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1265             # The dict keys are capitalized because of this bug by urllib
1266             if h.capitalize() not in req.headers:
1267                 req.add_header(h, v)
1268
1269         req.headers = handle_youtubedl_headers(req.headers)
1270
1271         if sys.version_info < (2, 7) and '#' in req.get_full_url():
1272             # Python 2.6 is brain-dead when it comes to fragments
1273             req._Request__original = req._Request__original.partition('#')[0]
1274             req._Request__r_type = req._Request__r_type.partition('#')[0]
1275
1276         return req
1277
1278     def http_response(self, req, resp):
1279         old_resp = resp
1280         # gzip
1281         if resp.headers.get('Content-encoding', '') == 'gzip':
1282             content = resp.read()
1283             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1284             try:
1285                 uncompressed = io.BytesIO(gz.read())
1286             except IOError as original_ioerror:
1287                 # There may be junk add the end of the file
1288                 # See http://stackoverflow.com/q/4928560/35070 for details
1289                 for i in range(1, 1024):
1290                     try:
1291                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1292                         uncompressed = io.BytesIO(gz.read())
1293                     except IOError:
1294                         continue
1295                     break
1296                 else:
1297                     raise original_ioerror
1298             resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1299             resp.msg = old_resp.msg
1300             del resp.headers['Content-encoding']
1301         # deflate
1302         if resp.headers.get('Content-encoding', '') == 'deflate':
1303             gz = io.BytesIO(self.deflate(resp.read()))
1304             resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1305             resp.msg = old_resp.msg
1306             del resp.headers['Content-encoding']
1307         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1308         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1309         if 300 <= resp.code < 400:
1310             location = resp.headers.get('Location')
1311             if location:
1312                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1313                 if sys.version_info >= (3, 0):
1314                     location = location.encode('iso-8859-1').decode('utf-8')
1315                 else:
1316                     location = location.decode('utf-8')
1317                 location_escaped = escape_url(location)
1318                 if location != location_escaped:
1319                     del resp.headers['Location']
1320                     if sys.version_info < (3, 0):
1321                         location_escaped = location_escaped.encode('utf-8')
1322                     resp.headers['Location'] = location_escaped
1323         return resp
1324
1325     https_request = http_request
1326     https_response = http_response
1327
1328
1329 def make_socks_conn_class(base_class, socks_proxy):
1330     assert issubclass(base_class, (
1331         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1332
1333     url_components = compat_urlparse.urlparse(socks_proxy)
1334     if url_components.scheme.lower() == 'socks5':
1335         socks_type = ProxyType.SOCKS5
1336     elif url_components.scheme.lower() in ('socks', 'socks4'):
1337         socks_type = ProxyType.SOCKS4
1338     elif url_components.scheme.lower() == 'socks4a':
1339         socks_type = ProxyType.SOCKS4A
1340
1341     def unquote_if_non_empty(s):
1342         if not s:
1343             return s
1344         return compat_urllib_parse_unquote_plus(s)
1345
1346     proxy_args = (
1347         socks_type,
1348         url_components.hostname, url_components.port or 1080,
1349         True,  # Remote DNS
1350         unquote_if_non_empty(url_components.username),
1351         unquote_if_non_empty(url_components.password),
1352     )
1353
1354     class SocksConnection(base_class):
1355         def connect(self):
1356             self.sock = sockssocket()
1357             self.sock.setproxy(*proxy_args)
1358             if type(self.timeout) in (int, float):
1359                 self.sock.settimeout(self.timeout)
1360             self.sock.connect((self.host, self.port))
1361
1362             if isinstance(self, compat_http_client.HTTPSConnection):
1363                 if hasattr(self, '_context'):  # Python > 2.6
1364                     self.sock = self._context.wrap_socket(
1365                         self.sock, server_hostname=self.host)
1366                 else:
1367                     self.sock = ssl.wrap_socket(self.sock)
1368
1369     return SocksConnection
1370
1371
1372 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1373     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1374         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1375         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1376         self._params = params
1377
1378     def https_open(self, req):
1379         kwargs = {}
1380         conn_class = self._https_conn_class
1381
1382         if hasattr(self, '_context'):  # python > 2.6
1383             kwargs['context'] = self._context
1384         if hasattr(self, '_check_hostname'):  # python 3.x
1385             kwargs['check_hostname'] = self._check_hostname
1386
1387         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1388         if socks_proxy:
1389             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1390             del req.headers['Ytdl-socks-proxy']
1391
1392         return self.do_open(functools.partial(
1393             _create_http_connection, self, conn_class, True),
1394             req, **kwargs)
1395
1396
1397 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1398     """
1399     See [1] for cookie file format.
1400
1401     1. https://curl.haxx.se/docs/http-cookies.html
1402     """
1403     _HTTPONLY_PREFIX = '#HttpOnly_'
1404     _ENTRY_LEN = 7
1405     _HEADER = '''# Netscape HTTP Cookie File
1406 # This file is generated by yt-dlp.  Do not edit.
1407
1408 '''
1409     _CookieFileEntry = collections.namedtuple(
1410         'CookieFileEntry',
1411         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1412
1413     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1414         """
1415         Save cookies to a file.
1416
1417         Most of the code is taken from CPython 3.8 and slightly adapted
1418         to support cookie files with UTF-8 in both python 2 and 3.
1419         """
1420         if filename is None:
1421             if self.filename is not None:
1422                 filename = self.filename
1423             else:
1424                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1425
1426         # Store session cookies with `expires` set to 0 instead of an empty
1427         # string
1428         for cookie in self:
1429             if cookie.expires is None:
1430                 cookie.expires = 0
1431
1432         with io.open(filename, 'w', encoding='utf-8') as f:
1433             f.write(self._HEADER)
1434             now = time.time()
1435             for cookie in self:
1436                 if not ignore_discard and cookie.discard:
1437                     continue
1438                 if not ignore_expires and cookie.is_expired(now):
1439                     continue
1440                 if cookie.secure:
1441                     secure = 'TRUE'
1442                 else:
1443                     secure = 'FALSE'
1444                 if cookie.domain.startswith('.'):
1445                     initial_dot = 'TRUE'
1446                 else:
1447                     initial_dot = 'FALSE'
1448                 if cookie.expires is not None:
1449                     expires = compat_str(cookie.expires)
1450                 else:
1451                     expires = ''
1452                 if cookie.value is None:
1453                     # cookies.txt regards 'Set-Cookie: foo' as a cookie
1454                     # with no name, whereas http.cookiejar regards it as a
1455                     # cookie with no value.
1456                     name = ''
1457                     value = cookie.name
1458                 else:
1459                     name = cookie.name
1460                     value = cookie.value
1461                 f.write(
1462                     '\t'.join([cookie.domain, initial_dot, cookie.path,
1463                                secure, expires, name, value]) + '\n')
1464
1465     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1466         """Load cookies from a file."""
1467         if filename is None:
1468             if self.filename is not None:
1469                 filename = self.filename
1470             else:
1471                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1472
1473         def prepare_line(line):
1474             if line.startswith(self._HTTPONLY_PREFIX):
1475                 line = line[len(self._HTTPONLY_PREFIX):]
1476             # comments and empty lines are fine
1477             if line.startswith('#') or not line.strip():
1478                 return line
1479             cookie_list = line.split('\t')
1480             if len(cookie_list) != self._ENTRY_LEN:
1481                 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1482             cookie = self._CookieFileEntry(*cookie_list)
1483             if cookie.expires_at and not cookie.expires_at.isdigit():
1484                 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1485             return line
1486
1487         cf = io.StringIO()
1488         with io.open(filename, encoding='utf-8') as f:
1489             for line in f:
1490                 try:
1491                     cf.write(prepare_line(line))
1492                 except compat_cookiejar.LoadError as e:
1493                     write_string(
1494                         'WARNING: skipping cookie file entry due to %s: %r\n'
1495                         % (e, line), sys.stderr)
1496                     continue
1497         cf.seek(0)
1498         self._really_load(cf, filename, ignore_discard, ignore_expires)
1499         # Session cookies are denoted by either `expires` field set to
1500         # an empty string or 0. MozillaCookieJar only recognizes the former
1501         # (see [1]). So we need force the latter to be recognized as session
1502         # cookies on our own.
1503         # Session cookies may be important for cookies-based authentication,
1504         # e.g. usually, when user does not check 'Remember me' check box while
1505         # logging in on a site, some important cookies are stored as session
1506         # cookies so that not recognizing them will result in failed login.
1507         # 1. https://bugs.python.org/issue17164
1508         for cookie in self:
1509             # Treat `expires=0` cookies as session cookies
1510             if cookie.expires == 0:
1511                 cookie.expires = None
1512                 cookie.discard = True
1513
1514
1515 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1516     def __init__(self, cookiejar=None):
1517         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1518
1519     def http_response(self, request, response):
1520         # Python 2 will choke on next HTTP request in row if there are non-ASCII
1521         # characters in Set-Cookie HTTP header of last response (see
1522         # https://github.com/ytdl-org/youtube-dl/issues/6769).
1523         # In order to at least prevent crashing we will percent encode Set-Cookie
1524         # header before HTTPCookieProcessor starts processing it.
1525         # if sys.version_info < (3, 0) and response.headers:
1526         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1527         #         set_cookie = response.headers.get(set_cookie_header)
1528         #         if set_cookie:
1529         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1530         #             if set_cookie != set_cookie_escaped:
1531         #                 del response.headers[set_cookie_header]
1532         #                 response.headers[set_cookie_header] = set_cookie_escaped
1533         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1534
1535     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1536     https_response = http_response
1537
1538
1539 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1540     """YoutubeDL redirect handler
1541
1542     The code is based on HTTPRedirectHandler implementation from CPython [1].
1543
1544     This redirect handler solves two issues:
1545      - ensures redirect URL is always unicode under python 2
1546      - introduces support for experimental HTTP response status code
1547        308 Permanent Redirect [2] used by some sites [3]
1548
1549     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1550     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1551     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1552     """
1553
1554     http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1555
1556     def redirect_request(self, req, fp, code, msg, headers, newurl):
1557         """Return a Request or None in response to a redirect.
1558
1559         This is called by the http_error_30x methods when a
1560         redirection response is received.  If a redirection should
1561         take place, return a new Request to allow http_error_30x to
1562         perform the redirect.  Otherwise, raise HTTPError if no-one
1563         else should try to handle this url.  Return None if you can't
1564         but another Handler might.
1565         """
1566         m = req.get_method()
1567         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1568                  or code in (301, 302, 303) and m == "POST")):
1569             raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1570         # Strictly (according to RFC 2616), 301 or 302 in response to
1571         # a POST MUST NOT cause a redirection without confirmation
1572         # from the user (of urllib.request, in this case).  In practice,
1573         # essentially all clients do redirect in this case, so we do
1574         # the same.
1575
1576         # On python 2 urlh.geturl() may sometimes return redirect URL
1577         # as byte string instead of unicode. This workaround allows
1578         # to force it always return unicode.
1579         if sys.version_info[0] < 3:
1580             newurl = compat_str(newurl)
1581
1582         # Be conciliant with URIs containing a space.  This is mainly
1583         # redundant with the more complete encoding done in http_error_302(),
1584         # but it is kept for compatibility with other callers.
1585         newurl = newurl.replace(' ', '%20')
1586
1587         CONTENT_HEADERS = ("content-length", "content-type")
1588         # NB: don't use dict comprehension for python 2.6 compatibility
1589         newheaders = dict((k, v) for k, v in req.headers.items()
1590                           if k.lower() not in CONTENT_HEADERS)
1591         return compat_urllib_request.Request(
1592             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1593             unverifiable=True)
1594
1595
1596 def extract_timezone(date_str):
1597     m = re.search(
1598         r'''(?x)
1599             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1600             (?P<tz>Z|                                            # just the UTC Z, or
1601                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1602                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1603                    [ ]?                                          # optional space
1604                 (?P<sign>\+|-)                                   # +/-
1605                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1606             $)
1607         ''', date_str)
1608     if not m:
1609         timezone = datetime.timedelta()
1610     else:
1611         date_str = date_str[:-len(m.group('tz'))]
1612         if not m.group('sign'):
1613             timezone = datetime.timedelta()
1614         else:
1615             sign = 1 if m.group('sign') == '+' else -1
1616             timezone = datetime.timedelta(
1617                 hours=sign * int(m.group('hours')),
1618                 minutes=sign * int(m.group('minutes')))
1619     return timezone, date_str
1620
1621
1622 def parse_iso8601(date_str, delimiter='T', timezone=None):
1623     """ Return a UNIX timestamp from the given date """
1624
1625     if date_str is None:
1626         return None
1627
1628     date_str = re.sub(r'\.[0-9]+', '', date_str)
1629
1630     if timezone is None:
1631         timezone, date_str = extract_timezone(date_str)
1632
1633     try:
1634         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1635         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1636         return calendar.timegm(dt.timetuple())
1637     except ValueError:
1638         pass
1639
1640
1641 def date_formats(day_first=True):
1642     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1643
1644
1645 def unified_strdate(date_str, day_first=True):
1646     """Return a string with the date in the format YYYYMMDD"""
1647
1648     if date_str is None:
1649         return None
1650     upload_date = None
1651     # Replace commas
1652     date_str = date_str.replace(',', ' ')
1653     # Remove AM/PM + timezone
1654     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1655     _, date_str = extract_timezone(date_str)
1656
1657     for expression in date_formats(day_first):
1658         try:
1659             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1660         except ValueError:
1661             pass
1662     if upload_date is None:
1663         timetuple = email.utils.parsedate_tz(date_str)
1664         if timetuple:
1665             try:
1666                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1667             except ValueError:
1668                 pass
1669     if upload_date is not None:
1670         return compat_str(upload_date)
1671
1672
1673 def unified_timestamp(date_str, day_first=True):
1674     if date_str is None:
1675         return None
1676
1677     date_str = re.sub(r'[,|]', '', date_str)
1678
1679     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1680     timezone, date_str = extract_timezone(date_str)
1681
1682     # Remove AM/PM + timezone
1683     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1684
1685     # Remove unrecognized timezones from ISO 8601 alike timestamps
1686     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1687     if m:
1688         date_str = date_str[:-len(m.group('tz'))]
1689
1690     # Python only supports microseconds, so remove nanoseconds
1691     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1692     if m:
1693         date_str = m.group(1)
1694
1695     for expression in date_formats(day_first):
1696         try:
1697             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1698             return calendar.timegm(dt.timetuple())
1699         except ValueError:
1700             pass
1701     timetuple = email.utils.parsedate_tz(date_str)
1702     if timetuple:
1703         return calendar.timegm(timetuple) + pm_delta * 3600
1704
1705
1706 def determine_ext(url, default_ext='unknown_video'):
1707     if url is None or '.' not in url:
1708         return default_ext
1709     guess = url.partition('?')[0].rpartition('.')[2]
1710     if re.match(r'^[A-Za-z0-9]+$', guess):
1711         return guess
1712     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1713     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1714         return guess.rstrip('/')
1715     else:
1716         return default_ext
1717
1718
1719 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1720     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1721
1722
1723 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1724     """
1725     Return a datetime object from a string in the format YYYYMMDD or
1726     (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1727
1728     format: string date format used to return datetime object from
1729     precision: round the time portion of a datetime object.
1730                 auto|microsecond|second|minute|hour|day.
1731                 auto: round to the unit provided in date_str (if applicable).
1732     """
1733     auto_precision = False
1734     if precision == 'auto':
1735         auto_precision = True
1736         precision = 'microsecond'
1737     today = datetime_round(datetime.datetime.now(), precision)
1738     if date_str in ('now', 'today'):
1739         return today
1740     if date_str == 'yesterday':
1741         return today - datetime.timedelta(days=1)
1742     match = re.match(
1743         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1744         date_str)
1745     if match is not None:
1746         start_time = datetime_from_str(match.group('start'), precision, format)
1747         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1748         unit = match.group('unit')
1749         if unit == 'month' or unit == 'year':
1750             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1751             unit = 'day'
1752         else:
1753             if unit == 'week':
1754                 unit = 'day'
1755                 time *= 7
1756             delta = datetime.timedelta(**{unit + 's': time})
1757             new_date = start_time + delta
1758         if auto_precision:
1759             return datetime_round(new_date, unit)
1760         return new_date
1761
1762     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1763
1764
1765 def date_from_str(date_str, format='%Y%m%d'):
1766     """
1767     Return a datetime object from a string in the format YYYYMMDD or
1768     (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1769
1770     format: string date format used to return datetime object from
1771     """
1772     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1773
1774
1775 def datetime_add_months(dt, months):
1776     """Increment/Decrement a datetime object by months."""
1777     month = dt.month + months - 1
1778     year = dt.year + month // 12
1779     month = month % 12 + 1
1780     day = min(dt.day, calendar.monthrange(year, month)[1])
1781     return dt.replace(year, month, day)
1782
1783
1784 def datetime_round(dt, precision='day'):
1785     """
1786     Round a datetime object's time to a specific precision
1787     """
1788     if precision == 'microsecond':
1789         return dt
1790
1791     unit_seconds = {
1792         'day': 86400,
1793         'hour': 3600,
1794         'minute': 60,
1795         'second': 1,
1796     }
1797     roundto = lambda x, n: ((x + n / 2) // n) * n
1798     timestamp = calendar.timegm(dt.timetuple())
1799     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1800
1801
1802 def hyphenate_date(date_str):
1803     """
1804     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1805     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1806     if match is not None:
1807         return '-'.join(match.groups())
1808     else:
1809         return date_str
1810
1811
1812 class DateRange(object):
1813     """Represents a time interval between two dates"""
1814
1815     def __init__(self, start=None, end=None):
1816         """start and end must be strings in the format accepted by date"""
1817         if start is not None:
1818             self.start = date_from_str(start)
1819         else:
1820             self.start = datetime.datetime.min.date()
1821         if end is not None:
1822             self.end = date_from_str(end)
1823         else:
1824             self.end = datetime.datetime.max.date()
1825         if self.start > self.end:
1826             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1827
1828     @classmethod
1829     def day(cls, day):
1830         """Returns a range that only contains the given day"""
1831         return cls(day, day)
1832
1833     def __contains__(self, date):
1834         """Check if the date is in the range"""
1835         if not isinstance(date, datetime.date):
1836             date = date_from_str(date)
1837         return self.start <= date <= self.end
1838
1839     def __str__(self):
1840         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1841
1842
1843 def platform_name():
1844     """ Returns the platform name as a compat_str """
1845     res = platform.platform()
1846     if isinstance(res, bytes):
1847         res = res.decode(preferredencoding())
1848
1849     assert isinstance(res, compat_str)
1850     return res
1851
1852
1853 def get_windows_version():
1854     ''' Get Windows version. None if it's not running on Windows '''
1855     if compat_os_name == 'nt':
1856         return version_tuple(platform.win32_ver()[1])
1857     else:
1858         return None
1859
1860
1861 def _windows_write_string(s, out):
1862     """ Returns True if the string was written using special methods,
1863     False if it has yet to be written out."""
1864     # Adapted from http://stackoverflow.com/a/3259271/35070
1865
1866     import ctypes.wintypes
1867
1868     WIN_OUTPUT_IDS = {
1869         1: -11,
1870         2: -12,
1871     }
1872
1873     try:
1874         fileno = out.fileno()
1875     except AttributeError:
1876         # If the output stream doesn't have a fileno, it's virtual
1877         return False
1878     except io.UnsupportedOperation:
1879         # Some strange Windows pseudo files?
1880         return False
1881     if fileno not in WIN_OUTPUT_IDS:
1882         return False
1883
1884     GetStdHandle = compat_ctypes_WINFUNCTYPE(
1885         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1886         ('GetStdHandle', ctypes.windll.kernel32))
1887     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1888
1889     WriteConsoleW = compat_ctypes_WINFUNCTYPE(
1890         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1891         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1892         ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
1893     written = ctypes.wintypes.DWORD(0)
1894
1895     GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
1896     FILE_TYPE_CHAR = 0x0002
1897     FILE_TYPE_REMOTE = 0x8000
1898     GetConsoleMode = compat_ctypes_WINFUNCTYPE(
1899         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1900         ctypes.POINTER(ctypes.wintypes.DWORD))(
1901         ('GetConsoleMode', ctypes.windll.kernel32))
1902     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1903
1904     def not_a_console(handle):
1905         if handle == INVALID_HANDLE_VALUE or handle is None:
1906             return True
1907         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1908                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1909
1910     if not_a_console(h):
1911         return False
1912
1913     def next_nonbmp_pos(s):
1914         try:
1915             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1916         except StopIteration:
1917             return len(s)
1918
1919     while s:
1920         count = min(next_nonbmp_pos(s), 1024)
1921
1922         ret = WriteConsoleW(
1923             h, s, count if count else 2, ctypes.byref(written), None)
1924         if ret == 0:
1925             raise OSError('Failed to write string')
1926         if not count:  # We just wrote a non-BMP character
1927             assert written.value == 2
1928             s = s[1:]
1929         else:
1930             assert written.value > 0
1931             s = s[written.value:]
1932     return True
1933
1934
1935 def write_string(s, out=None, encoding=None):
1936     if out is None:
1937         out = sys.stderr
1938     assert type(s) == compat_str
1939
1940     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1941         if _windows_write_string(s, out):
1942             return
1943
1944     if ('b' in getattr(out, 'mode', '')
1945             or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
1946         byt = s.encode(encoding or preferredencoding(), 'ignore')
1947         out.write(byt)
1948     elif hasattr(out, 'buffer'):
1949         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1950         byt = s.encode(enc, 'ignore')
1951         out.buffer.write(byt)
1952     else:
1953         out.write(s)
1954     out.flush()
1955
1956
1957 def bytes_to_intlist(bs):
1958     if not bs:
1959         return []
1960     if isinstance(bs[0], int):  # Python 3
1961         return list(bs)
1962     else:
1963         return [ord(c) for c in bs]
1964
1965
1966 def intlist_to_bytes(xs):
1967     if not xs:
1968         return b''
1969     return compat_struct_pack('%dB' % len(xs), *xs)
1970
1971
1972 # Cross-platform file locking
1973 if sys.platform == 'win32':
1974     import ctypes.wintypes
1975     import msvcrt
1976
1977     class OVERLAPPED(ctypes.Structure):
1978         _fields_ = [
1979             ('Internal', ctypes.wintypes.LPVOID),
1980             ('InternalHigh', ctypes.wintypes.LPVOID),
1981             ('Offset', ctypes.wintypes.DWORD),
1982             ('OffsetHigh', ctypes.wintypes.DWORD),
1983             ('hEvent', ctypes.wintypes.HANDLE),
1984         ]
1985
1986     kernel32 = ctypes.windll.kernel32
1987     LockFileEx = kernel32.LockFileEx
1988     LockFileEx.argtypes = [
1989         ctypes.wintypes.HANDLE,     # hFile
1990         ctypes.wintypes.DWORD,      # dwFlags
1991         ctypes.wintypes.DWORD,      # dwReserved
1992         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1993         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1994         ctypes.POINTER(OVERLAPPED)  # Overlapped
1995     ]
1996     LockFileEx.restype = ctypes.wintypes.BOOL
1997     UnlockFileEx = kernel32.UnlockFileEx
1998     UnlockFileEx.argtypes = [
1999         ctypes.wintypes.HANDLE,     # hFile
2000         ctypes.wintypes.DWORD,      # dwReserved
2001         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2002         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2003         ctypes.POINTER(OVERLAPPED)  # Overlapped
2004     ]
2005     UnlockFileEx.restype = ctypes.wintypes.BOOL
2006     whole_low = 0xffffffff
2007     whole_high = 0x7fffffff
2008
2009     def _lock_file(f, exclusive):
2010         overlapped = OVERLAPPED()
2011         overlapped.Offset = 0
2012         overlapped.OffsetHigh = 0
2013         overlapped.hEvent = 0
2014         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2015         handle = msvcrt.get_osfhandle(f.fileno())
2016         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2017                           whole_low, whole_high, f._lock_file_overlapped_p):
2018             raise OSError('Locking file failed: %r' % ctypes.FormatError())
2019
2020     def _unlock_file(f):
2021         assert f._lock_file_overlapped_p
2022         handle = msvcrt.get_osfhandle(f.fileno())
2023         if not UnlockFileEx(handle, 0,
2024                             whole_low, whole_high, f._lock_file_overlapped_p):
2025             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2026
2027 else:
2028     # Some platforms, such as Jython, is missing fcntl
2029     try:
2030         import fcntl
2031
2032         def _lock_file(f, exclusive):
2033             fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
2034
2035         def _unlock_file(f):
2036             fcntl.flock(f, fcntl.LOCK_UN)
2037     except ImportError:
2038         UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2039
2040         def _lock_file(f, exclusive):
2041             raise IOError(UNSUPPORTED_MSG)
2042
2043         def _unlock_file(f):
2044             raise IOError(UNSUPPORTED_MSG)
2045
2046
2047 class locked_file(object):
2048     def __init__(self, filename, mode, encoding=None):
2049         assert mode in ['r', 'a', 'w']
2050         self.f = io.open(filename, mode, encoding=encoding)
2051         self.mode = mode
2052
2053     def __enter__(self):
2054         exclusive = self.mode != 'r'
2055         try:
2056             _lock_file(self.f, exclusive)
2057         except IOError:
2058             self.f.close()
2059             raise
2060         return self
2061
2062     def __exit__(self, etype, value, traceback):
2063         try:
2064             _unlock_file(self.f)
2065         finally:
2066             self.f.close()
2067
2068     def __iter__(self):
2069         return iter(self.f)
2070
2071     def write(self, *args):
2072         return self.f.write(*args)
2073
2074     def read(self, *args):
2075         return self.f.read(*args)
2076
2077
2078 def get_filesystem_encoding():
2079     encoding = sys.getfilesystemencoding()
2080     return encoding if encoding is not None else 'utf-8'
2081
2082
2083 def shell_quote(args):
2084     quoted_args = []
2085     encoding = get_filesystem_encoding()
2086     for a in args:
2087         if isinstance(a, bytes):
2088             # We may get a filename encoded with 'encodeFilename'
2089             a = a.decode(encoding)
2090         quoted_args.append(compat_shlex_quote(a))
2091     return ' '.join(quoted_args)
2092
2093
2094 def smuggle_url(url, data):
2095     """ Pass additional data in a URL for internal use. """
2096
2097     url, idata = unsmuggle_url(url, {})
2098     data.update(idata)
2099     sdata = compat_urllib_parse_urlencode(
2100         {'__youtubedl_smuggle': json.dumps(data)})
2101     return url + '#' + sdata
2102
2103
2104 def unsmuggle_url(smug_url, default=None):
2105     if '#__youtubedl_smuggle' not in smug_url:
2106         return smug_url, default
2107     url, _, sdata = smug_url.rpartition('#')
2108     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2109     data = json.loads(jsond)
2110     return url, data
2111
2112
2113 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2114     """ Formats numbers with decimal sufixes like K, M, etc """
2115     num, factor = float_or_none(num), float(factor)
2116     if num is None:
2117         return None
2118     exponent = 0 if num == 0 else int(math.log(num, factor))
2119     suffix = ['', *'kMGTPEZY'][exponent]
2120     if factor == 1024:
2121         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2122     converted = num / (factor ** exponent)
2123     return fmt % (converted, suffix)
2124
2125
2126 def format_bytes(bytes):
2127     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2128
2129
2130 def lookup_unit_table(unit_table, s):
2131     units_re = '|'.join(re.escape(u) for u in unit_table)
2132     m = re.match(
2133         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2134     if not m:
2135         return None
2136     num_str = m.group('num').replace(',', '.')
2137     mult = unit_table[m.group('unit')]
2138     return int(float(num_str) * mult)
2139
2140
2141 def parse_filesize(s):
2142     if s is None:
2143         return None
2144
2145     # The lower-case forms are of course incorrect and unofficial,
2146     # but we support those too
2147     _UNIT_TABLE = {
2148         'B': 1,
2149         'b': 1,
2150         'bytes': 1,
2151         'KiB': 1024,
2152         'KB': 1000,
2153         'kB': 1024,
2154         'Kb': 1000,
2155         'kb': 1000,
2156         'kilobytes': 1000,
2157         'kibibytes': 1024,
2158         'MiB': 1024 ** 2,
2159         'MB': 1000 ** 2,
2160         'mB': 1024 ** 2,
2161         'Mb': 1000 ** 2,
2162         'mb': 1000 ** 2,
2163         'megabytes': 1000 ** 2,
2164         'mebibytes': 1024 ** 2,
2165         'GiB': 1024 ** 3,
2166         'GB': 1000 ** 3,
2167         'gB': 1024 ** 3,
2168         'Gb': 1000 ** 3,
2169         'gb': 1000 ** 3,
2170         'gigabytes': 1000 ** 3,
2171         'gibibytes': 1024 ** 3,
2172         'TiB': 1024 ** 4,
2173         'TB': 1000 ** 4,
2174         'tB': 1024 ** 4,
2175         'Tb': 1000 ** 4,
2176         'tb': 1000 ** 4,
2177         'terabytes': 1000 ** 4,
2178         'tebibytes': 1024 ** 4,
2179         'PiB': 1024 ** 5,
2180         'PB': 1000 ** 5,
2181         'pB': 1024 ** 5,
2182         'Pb': 1000 ** 5,
2183         'pb': 1000 ** 5,
2184         'petabytes': 1000 ** 5,
2185         'pebibytes': 1024 ** 5,
2186         'EiB': 1024 ** 6,
2187         'EB': 1000 ** 6,
2188         'eB': 1024 ** 6,
2189         'Eb': 1000 ** 6,
2190         'eb': 1000 ** 6,
2191         'exabytes': 1000 ** 6,
2192         'exbibytes': 1024 ** 6,
2193         'ZiB': 1024 ** 7,
2194         'ZB': 1000 ** 7,
2195         'zB': 1024 ** 7,
2196         'Zb': 1000 ** 7,
2197         'zb': 1000 ** 7,
2198         'zettabytes': 1000 ** 7,
2199         'zebibytes': 1024 ** 7,
2200         'YiB': 1024 ** 8,
2201         'YB': 1000 ** 8,
2202         'yB': 1024 ** 8,
2203         'Yb': 1000 ** 8,
2204         'yb': 1000 ** 8,
2205         'yottabytes': 1000 ** 8,
2206         'yobibytes': 1024 ** 8,
2207     }
2208
2209     return lookup_unit_table(_UNIT_TABLE, s)
2210
2211
2212 def parse_count(s):
2213     if s is None:
2214         return None
2215
2216     s = re.sub(r'^[^\d]+\s', '', s).strip()
2217
2218     if re.match(r'^[\d,.]+$', s):
2219         return str_to_int(s)
2220
2221     _UNIT_TABLE = {
2222         'k': 1000,
2223         'K': 1000,
2224         'm': 1000 ** 2,
2225         'M': 1000 ** 2,
2226         'kk': 1000 ** 2,
2227         'KK': 1000 ** 2,
2228         'b': 1000 ** 3,
2229         'B': 1000 ** 3,
2230     }
2231
2232     ret = lookup_unit_table(_UNIT_TABLE, s)
2233     if ret is not None:
2234         return ret
2235
2236     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2237     if mobj:
2238         return str_to_int(mobj.group(1))
2239
2240
2241 def parse_resolution(s):
2242     if s is None:
2243         return {}
2244
2245     mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2246     if mobj:
2247         return {
2248             'width': int(mobj.group('w')),
2249             'height': int(mobj.group('h')),
2250         }
2251
2252     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2253     if mobj:
2254         return {'height': int(mobj.group(1))}
2255
2256     mobj = re.search(r'\b([48])[kK]\b', s)
2257     if mobj:
2258         return {'height': int(mobj.group(1)) * 540}
2259
2260     return {}
2261
2262
2263 def parse_bitrate(s):
2264     if not isinstance(s, compat_str):
2265         return
2266     mobj = re.search(r'\b(\d+)\s*kbps', s)
2267     if mobj:
2268         return int(mobj.group(1))
2269
2270
2271 def month_by_name(name, lang='en'):
2272     """ Return the number of a month by (locale-independently) English name """
2273
2274     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2275
2276     try:
2277         return month_names.index(name) + 1
2278     except ValueError:
2279         return None
2280
2281
2282 def month_by_abbreviation(abbrev):
2283     """ Return the number of a month by (locale-independently) English
2284         abbreviations """
2285
2286     try:
2287         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2288     except ValueError:
2289         return None
2290
2291
2292 def fix_xml_ampersands(xml_str):
2293     """Replace all the '&' by '&amp;' in XML"""
2294     return re.sub(
2295         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2296         '&amp;',
2297         xml_str)
2298
2299
2300 def setproctitle(title):
2301     assert isinstance(title, compat_str)
2302
2303     # ctypes in Jython is not complete
2304     # http://bugs.jython.org/issue2148
2305     if sys.platform.startswith('java'):
2306         return
2307
2308     try:
2309         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2310     except OSError:
2311         return
2312     except TypeError:
2313         # LoadLibrary in Windows Python 2.7.13 only expects
2314         # a bytestring, but since unicode_literals turns
2315         # every string into a unicode string, it fails.
2316         return
2317     title_bytes = title.encode('utf-8')
2318     buf = ctypes.create_string_buffer(len(title_bytes))
2319     buf.value = title_bytes
2320     try:
2321         libc.prctl(15, buf, 0, 0, 0)
2322     except AttributeError:
2323         return  # Strange libc, just skip this
2324
2325
2326 def remove_start(s, start):
2327     return s[len(start):] if s is not None and s.startswith(start) else s
2328
2329
2330 def remove_end(s, end):
2331     return s[:-len(end)] if s is not None and s.endswith(end) else s
2332
2333
2334 def remove_quotes(s):
2335     if s is None or len(s) < 2:
2336         return s
2337     for quote in ('"', "'", ):
2338         if s[0] == quote and s[-1] == quote:
2339             return s[1:-1]
2340     return s
2341
2342
2343 def get_domain(url):
2344     domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2345     return domain.group('domain') if domain else None
2346
2347
2348 def url_basename(url):
2349     path = compat_urlparse.urlparse(url).path
2350     return path.strip('/').split('/')[-1]
2351
2352
2353 def base_url(url):
2354     return re.match(r'https?://[^?#&]+/', url).group()
2355
2356
2357 def urljoin(base, path):
2358     if isinstance(path, bytes):
2359         path = path.decode('utf-8')
2360     if not isinstance(path, compat_str) or not path:
2361         return None
2362     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2363         return path
2364     if isinstance(base, bytes):
2365         base = base.decode('utf-8')
2366     if not isinstance(base, compat_str) or not re.match(
2367             r'^(?:https?:)?//', base):
2368         return None
2369     return compat_urlparse.urljoin(base, path)
2370
2371
2372 class HEADRequest(compat_urllib_request.Request):
2373     def get_method(self):
2374         return 'HEAD'
2375
2376
2377 class PUTRequest(compat_urllib_request.Request):
2378     def get_method(self):
2379         return 'PUT'
2380
2381
2382 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2383     if get_attr:
2384         if v is not None:
2385             v = getattr(v, get_attr, None)
2386     if v == '':
2387         v = None
2388     if v is None:
2389         return default
2390     try:
2391         return int(v) * invscale // scale
2392     except (ValueError, TypeError, OverflowError):
2393         return default
2394
2395
2396 def str_or_none(v, default=None):
2397     return default if v is None else compat_str(v)
2398
2399
2400 def str_to_int(int_str):
2401     """ A more relaxed version of int_or_none """
2402     if isinstance(int_str, compat_integer_types):
2403         return int_str
2404     elif isinstance(int_str, compat_str):
2405         int_str = re.sub(r'[,\.\+]', '', int_str)
2406         return int_or_none(int_str)
2407
2408
2409 def float_or_none(v, scale=1, invscale=1, default=None):
2410     if v is None:
2411         return default
2412     try:
2413         return float(v) * invscale / scale
2414     except (ValueError, TypeError):
2415         return default
2416
2417
2418 def bool_or_none(v, default=None):
2419     return v if isinstance(v, bool) else default
2420
2421
2422 def strip_or_none(v, default=None):
2423     return v.strip() if isinstance(v, compat_str) else default
2424
2425
2426 def url_or_none(url):
2427     if not url or not isinstance(url, compat_str):
2428         return None
2429     url = url.strip()
2430     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2431
2432
2433 def strftime_or_none(timestamp, date_format, default=None):
2434     datetime_object = None
2435     try:
2436         if isinstance(timestamp, compat_numeric_types):  # unix timestamp
2437             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2438         elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
2439             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2440         return datetime_object.strftime(date_format)
2441     except (ValueError, TypeError, AttributeError):
2442         return default
2443
2444
2445 def parse_duration(s):
2446     if not isinstance(s, compat_basestring):
2447         return None
2448     s = s.strip()
2449     if not s:
2450         return None
2451
2452     days, hours, mins, secs, ms = [None] * 5
2453     m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
2454     if m:
2455         days, hours, mins, secs, ms = m.groups()
2456     else:
2457         m = re.match(
2458             r'''(?ix)(?:P?
2459                 (?:
2460                     [0-9]+\s*y(?:ears?)?\s*
2461                 )?
2462                 (?:
2463                     [0-9]+\s*m(?:onths?)?\s*
2464                 )?
2465                 (?:
2466                     [0-9]+\s*w(?:eeks?)?\s*
2467                 )?
2468                 (?:
2469                     (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2470                 )?
2471                 T)?
2472                 (?:
2473                     (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2474                 )?
2475                 (?:
2476                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2477                 )?
2478                 (?:
2479                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2480                 )?Z?$''', s)
2481         if m:
2482             days, hours, mins, secs, ms = m.groups()
2483         else:
2484             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2485             if m:
2486                 hours, mins = m.groups()
2487             else:
2488                 return None
2489
2490     duration = 0
2491     if secs:
2492         duration += float(secs)
2493     if mins:
2494         duration += float(mins) * 60
2495     if hours:
2496         duration += float(hours) * 60 * 60
2497     if days:
2498         duration += float(days) * 24 * 60 * 60
2499     if ms:
2500         duration += float(ms)
2501     return duration
2502
2503
2504 def prepend_extension(filename, ext, expected_real_ext=None):
2505     name, real_ext = os.path.splitext(filename)
2506     return (
2507         '{0}.{1}{2}'.format(name, ext, real_ext)
2508         if not expected_real_ext or real_ext[1:] == expected_real_ext
2509         else '{0}.{1}'.format(filename, ext))
2510
2511
2512 def replace_extension(filename, ext, expected_real_ext=None):
2513     name, real_ext = os.path.splitext(filename)
2514     return '{0}.{1}'.format(
2515         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2516         ext)
2517
2518
2519 def check_executable(exe, args=[]):
2520     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2521     args can be a list of arguments for a short output (like -version) """
2522     try:
2523         Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2524     except OSError:
2525         return False
2526     return exe
2527
2528
2529 def _get_exe_version_output(exe, args):
2530     try:
2531         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2532         # SIGTTOU if yt-dlp is run in the background.
2533         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2534         out, _ = Popen(
2535             [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2536             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2537     except OSError:
2538         return False
2539     if isinstance(out, bytes):  # Python 2.x
2540         out = out.decode('ascii', 'ignore')
2541     return out
2542
2543
2544 def detect_exe_version(output, version_re=None, unrecognized='present'):
2545     assert isinstance(output, compat_str)
2546     if version_re is None:
2547         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2548     m = re.search(version_re, output)
2549     if m:
2550         return m.group(1)
2551     else:
2552         return unrecognized
2553
2554
2555 def get_exe_version(exe, args=['--version'],
2556                     version_re=None, unrecognized='present'):
2557     """ Returns the version of the specified executable,
2558     or False if the executable is not present """
2559     out = _get_exe_version_output(exe, args)
2560     return detect_exe_version(out, version_re, unrecognized) if out else False
2561
2562
2563 class LazyList(collections.abc.Sequence):
2564     ''' Lazy immutable list from an iterable
2565     Note that slices of a LazyList are lists and not LazyList'''
2566
2567     class IndexError(IndexError):
2568         pass
2569
2570     def __init__(self, iterable, *, reverse=False, _cache=None):
2571         self.__iterable = iter(iterable)
2572         self.__cache = [] if _cache is None else _cache
2573         self.__reversed = reverse
2574
2575     def __iter__(self):
2576         if self.__reversed:
2577             # We need to consume the entire iterable to iterate in reverse
2578             yield from self.exhaust()
2579             return
2580         yield from self.__cache
2581         for item in self.__iterable:
2582             self.__cache.append(item)
2583             yield item
2584
2585     def __exhaust(self):
2586         self.__cache.extend(self.__iterable)
2587         # Discard the emptied iterable to make it pickle-able
2588         self.__iterable = []
2589         return self.__cache
2590
2591     def exhaust(self):
2592         ''' Evaluate the entire iterable '''
2593         return self.__exhaust()[::-1 if self.__reversed else 1]
2594
2595     @staticmethod
2596     def __reverse_index(x):
2597         return None if x is None else -(x + 1)
2598
2599     def __getitem__(self, idx):
2600         if isinstance(idx, slice):
2601             if self.__reversed:
2602                 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2603             start, stop, step = idx.start, idx.stop, idx.step or 1
2604         elif isinstance(idx, int):
2605             if self.__reversed:
2606                 idx = self.__reverse_index(idx)
2607             start, stop, step = idx, idx, 0
2608         else:
2609             raise TypeError('indices must be integers or slices')
2610         if ((start or 0) < 0 or (stop or 0) < 0
2611                 or (start is None and step < 0)
2612                 or (stop is None and step > 0)):
2613             # We need to consume the entire iterable to be able to slice from the end
2614             # Obviously, never use this with infinite iterables
2615             self.__exhaust()
2616             try:
2617                 return self.__cache[idx]
2618             except IndexError as e:
2619                 raise self.IndexError(e) from e
2620         n = max(start or 0, stop or 0) - len(self.__cache) + 1
2621         if n > 0:
2622             self.__cache.extend(itertools.islice(self.__iterable, n))
2623         try:
2624             return self.__cache[idx]
2625         except IndexError as e:
2626             raise self.IndexError(e) from e
2627
2628     def __bool__(self):
2629         try:
2630             self[-1] if self.__reversed else self[0]
2631         except self.IndexError:
2632             return False
2633         return True
2634
2635     def __len__(self):
2636         self.__exhaust()
2637         return len(self.__cache)
2638
2639     def __reversed__(self):
2640         return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2641
2642     def __copy__(self):
2643         return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2644
2645     def __repr__(self):
2646         # repr and str should mimic a list. So we exhaust the iterable
2647         return repr(self.exhaust())
2648
2649     def __str__(self):
2650         return repr(self.exhaust())
2651
2652
2653 class PagedList:
2654
2655     class IndexError(IndexError):
2656         pass
2657
2658     def __len__(self):
2659         # This is only useful for tests
2660         return len(self.getslice())
2661
2662     def __init__(self, pagefunc, pagesize, use_cache=True):
2663         self._pagefunc = pagefunc
2664         self._pagesize = pagesize
2665         self._use_cache = use_cache
2666         self._cache = {}
2667
2668     def getpage(self, pagenum):
2669         page_results = self._cache.get(pagenum)
2670         if page_results is None:
2671             page_results = list(self._pagefunc(pagenum))
2672         if self._use_cache:
2673             self._cache[pagenum] = page_results
2674         return page_results
2675
2676     def getslice(self, start=0, end=None):
2677         return list(self._getslice(start, end))
2678
2679     def _getslice(self, start, end):
2680         raise NotImplementedError('This method must be implemented by subclasses')
2681
2682     def __getitem__(self, idx):
2683         # NOTE: cache must be enabled if this is used
2684         if not isinstance(idx, int) or idx < 0:
2685             raise TypeError('indices must be non-negative integers')
2686         entries = self.getslice(idx, idx + 1)
2687         if not entries:
2688             raise self.IndexError()
2689         return entries[0]
2690
2691
2692 class OnDemandPagedList(PagedList):
2693     def _getslice(self, start, end):
2694         for pagenum in itertools.count(start // self._pagesize):
2695             firstid = pagenum * self._pagesize
2696             nextfirstid = pagenum * self._pagesize + self._pagesize
2697             if start >= nextfirstid:
2698                 continue
2699
2700             startv = (
2701                 start % self._pagesize
2702                 if firstid <= start < nextfirstid
2703                 else 0)
2704             endv = (
2705                 ((end - 1) % self._pagesize) + 1
2706                 if (end is not None and firstid <= end <= nextfirstid)
2707                 else None)
2708
2709             page_results = self.getpage(pagenum)
2710             if startv != 0 or endv is not None:
2711                 page_results = page_results[startv:endv]
2712             yield from page_results
2713
2714             # A little optimization - if current page is not "full", ie. does
2715             # not contain page_size videos then we can assume that this page
2716             # is the last one - there are no more ids on further pages -
2717             # i.e. no need to query again.
2718             if len(page_results) + startv < self._pagesize:
2719                 break
2720
2721             # If we got the whole page, but the next page is not interesting,
2722             # break out early as well
2723             if end == nextfirstid:
2724                 break
2725
2726
2727 class InAdvancePagedList(PagedList):
2728     def __init__(self, pagefunc, pagecount, pagesize):
2729         self._pagecount = pagecount
2730         PagedList.__init__(self, pagefunc, pagesize, True)
2731
2732     def _getslice(self, start, end):
2733         start_page = start // self._pagesize
2734         end_page = (
2735             self._pagecount if end is None else (end // self._pagesize + 1))
2736         skip_elems = start - start_page * self._pagesize
2737         only_more = None if end is None else end - start
2738         for pagenum in range(start_page, end_page):
2739             page_results = self.getpage(pagenum)
2740             if skip_elems:
2741                 page_results = page_results[skip_elems:]
2742                 skip_elems = None
2743             if only_more is not None:
2744                 if len(page_results) < only_more:
2745                     only_more -= len(page_results)
2746                 else:
2747                     yield from page_results[:only_more]
2748                     break
2749             yield from page_results
2750
2751
2752 def uppercase_escape(s):
2753     unicode_escape = codecs.getdecoder('unicode_escape')
2754     return re.sub(
2755         r'\\U[0-9a-fA-F]{8}',
2756         lambda m: unicode_escape(m.group(0))[0],
2757         s)
2758
2759
2760 def lowercase_escape(s):
2761     unicode_escape = codecs.getdecoder('unicode_escape')
2762     return re.sub(
2763         r'\\u[0-9a-fA-F]{4}',
2764         lambda m: unicode_escape(m.group(0))[0],
2765         s)
2766
2767
2768 def escape_rfc3986(s):
2769     """Escape non-ASCII characters as suggested by RFC 3986"""
2770     if sys.version_info < (3, 0) and isinstance(s, compat_str):
2771         s = s.encode('utf-8')
2772     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2773
2774
2775 def escape_url(url):
2776     """Escape URL as suggested by RFC 3986"""
2777     url_parsed = compat_urllib_parse_urlparse(url)
2778     return url_parsed._replace(
2779         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2780         path=escape_rfc3986(url_parsed.path),
2781         params=escape_rfc3986(url_parsed.params),
2782         query=escape_rfc3986(url_parsed.query),
2783         fragment=escape_rfc3986(url_parsed.fragment)
2784     ).geturl()
2785
2786
2787 def parse_qs(url):
2788     return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2789
2790
2791 def read_batch_urls(batch_fd):
2792     def fixup(url):
2793         if not isinstance(url, compat_str):
2794             url = url.decode('utf-8', 'replace')
2795         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2796         for bom in BOM_UTF8:
2797             if url.startswith(bom):
2798                 url = url[len(bom):]
2799         url = url.lstrip()
2800         if not url or url.startswith(('#', ';', ']')):
2801             return False
2802         # "#" cannot be stripped out since it is part of the URI
2803         # However, it can be safely stipped out if follwing a whitespace
2804         return re.split(r'\s#', url, 1)[0].rstrip()
2805
2806     with contextlib.closing(batch_fd) as fd:
2807         return [url for url in map(fixup, fd) if url]
2808
2809
2810 def urlencode_postdata(*args, **kargs):
2811     return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2812
2813
2814 def update_url_query(url, query):
2815     if not query:
2816         return url
2817     parsed_url = compat_urlparse.urlparse(url)
2818     qs = compat_parse_qs(parsed_url.query)
2819     qs.update(query)
2820     return compat_urlparse.urlunparse(parsed_url._replace(
2821         query=compat_urllib_parse_urlencode(qs, True)))
2822
2823
2824 def update_Request(req, url=None, data=None, headers={}, query={}):
2825     req_headers = req.headers.copy()
2826     req_headers.update(headers)
2827     req_data = data or req.data
2828     req_url = update_url_query(url or req.get_full_url(), query)
2829     req_get_method = req.get_method()
2830     if req_get_method == 'HEAD':
2831         req_type = HEADRequest
2832     elif req_get_method == 'PUT':
2833         req_type = PUTRequest
2834     else:
2835         req_type = compat_urllib_request.Request
2836     new_req = req_type(
2837         req_url, data=req_data, headers=req_headers,
2838         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2839     if hasattr(req, 'timeout'):
2840         new_req.timeout = req.timeout
2841     return new_req
2842
2843
2844 def _multipart_encode_impl(data, boundary):
2845     content_type = 'multipart/form-data; boundary=%s' % boundary
2846
2847     out = b''
2848     for k, v in data.items():
2849         out += b'--' + boundary.encode('ascii') + b'\r\n'
2850         if isinstance(k, compat_str):
2851             k = k.encode('utf-8')
2852         if isinstance(v, compat_str):
2853             v = v.encode('utf-8')
2854         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2855         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2856         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2857         if boundary.encode('ascii') in content:
2858             raise ValueError('Boundary overlaps with data')
2859         out += content
2860
2861     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2862
2863     return out, content_type
2864
2865
2866 def multipart_encode(data, boundary=None):
2867     '''
2868     Encode a dict to RFC 7578-compliant form-data
2869
2870     data:
2871         A dict where keys and values can be either Unicode or bytes-like
2872         objects.
2873     boundary:
2874         If specified a Unicode object, it's used as the boundary. Otherwise
2875         a random boundary is generated.
2876
2877     Reference: https://tools.ietf.org/html/rfc7578
2878     '''
2879     has_specified_boundary = boundary is not None
2880
2881     while True:
2882         if boundary is None:
2883             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2884
2885         try:
2886             out, content_type = _multipart_encode_impl(data, boundary)
2887             break
2888         except ValueError:
2889             if has_specified_boundary:
2890                 raise
2891             boundary = None
2892
2893     return out, content_type
2894
2895
2896 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2897     if isinstance(key_or_keys, (list, tuple)):
2898         for key in key_or_keys:
2899             if key not in d or d[key] is None or skip_false_values and not d[key]:
2900                 continue
2901             return d[key]
2902         return default
2903     return d.get(key_or_keys, default)
2904
2905
2906 def try_get(src, getter, expected_type=None):
2907     for get in variadic(getter):
2908         try:
2909             v = get(src)
2910         except (AttributeError, KeyError, TypeError, IndexError):
2911             pass
2912         else:
2913             if expected_type is None or isinstance(v, expected_type):
2914                 return v
2915
2916
2917 def merge_dicts(*dicts):
2918     merged = {}
2919     for a_dict in dicts:
2920         for k, v in a_dict.items():
2921             if v is None:
2922                 continue
2923             if (k not in merged
2924                     or (isinstance(v, compat_str) and v
2925                         and isinstance(merged[k], compat_str)
2926                         and not merged[k])):
2927                 merged[k] = v
2928     return merged
2929
2930
2931 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2932     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2933
2934
2935 US_RATINGS = {
2936     'G': 0,
2937     'PG': 10,
2938     'PG-13': 13,
2939     'R': 16,
2940     'NC': 18,
2941 }
2942
2943
2944 TV_PARENTAL_GUIDELINES = {
2945     'TV-Y': 0,
2946     'TV-Y7': 7,
2947     'TV-G': 0,
2948     'TV-PG': 0,
2949     'TV-14': 14,
2950     'TV-MA': 17,
2951 }
2952
2953
2954 def parse_age_limit(s):
2955     if type(s) == int:
2956         return s if 0 <= s <= 21 else None
2957     if not isinstance(s, compat_basestring):
2958         return None
2959     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2960     if m:
2961         return int(m.group('age'))
2962     s = s.upper()
2963     if s in US_RATINGS:
2964         return US_RATINGS[s]
2965     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2966     if m:
2967         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2968     return None
2969
2970
2971 def strip_jsonp(code):
2972     return re.sub(
2973         r'''(?sx)^
2974             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2975             (?:\s*&&\s*(?P=func_name))?
2976             \s*\(\s*(?P<callback_data>.*)\);?
2977             \s*?(?://[^\n]*)*$''',
2978         r'\g<callback_data>', code)
2979
2980
2981 def js_to_json(code, vars={}):
2982     # vars is a dict of var, val pairs to substitute
2983     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2984     SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2985     INTEGER_TABLE = (
2986         (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2987         (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2988     )
2989
2990     def fix_kv(m):
2991         v = m.group(0)
2992         if v in ('true', 'false', 'null'):
2993             return v
2994         elif v in ('undefined', 'void 0'):
2995             return 'null'
2996         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
2997             return ""
2998
2999         if v[0] in ("'", '"'):
3000             v = re.sub(r'(?s)\\.|"', lambda m: {
3001                 '"': '\\"',
3002                 "\\'": "'",
3003                 '\\\n': '',
3004                 '\\x': '\\u00',
3005             }.get(m.group(0), m.group(0)), v[1:-1])
3006         else:
3007             for regex, base in INTEGER_TABLE:
3008                 im = re.match(regex, v)
3009                 if im:
3010                     i = int(im.group(1), base)
3011                     return '"%d":' % i if v.endswith(':') else '%d' % i
3012
3013             if v in vars:
3014                 return vars[v]
3015
3016         return '"%s"' % v
3017
3018     return re.sub(r'''(?sx)
3019         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3020         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3021         {comment}|,(?={skip}[\]}}])|
3022         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3023         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3024         [0-9]+(?={skip}:)|
3025         !+
3026         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3027
3028
3029 def qualities(quality_ids):
3030     """ Get a numeric quality value out of a list of possible values """
3031     def q(qid):
3032         try:
3033             return quality_ids.index(qid)
3034         except ValueError:
3035             return -1
3036     return q
3037
3038
3039 POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process'}
3040
3041
3042 DEFAULT_OUTTMPL = {
3043     'default': '%(title)s [%(id)s].%(ext)s',
3044     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3045 }
3046 OUTTMPL_TYPES = {
3047     'chapter': None,
3048     'subtitle': None,
3049     'thumbnail': None,
3050     'description': 'description',
3051     'annotation': 'annotations.xml',
3052     'infojson': 'info.json',
3053     'link': None,
3054     'pl_thumbnail': None,
3055     'pl_description': 'description',
3056     'pl_infojson': 'info.json',
3057 }
3058
3059 # As of [1] format syntax is:
3060 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3061 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3062 STR_FORMAT_RE_TMPL = r'''(?x)
3063     (?<!%)(?P<prefix>(?:%%)*)
3064     %
3065     (?P<has_key>\((?P<key>{0})\))?
3066     (?P<format>
3067         (?P<conversion>[#0\-+ ]+)?
3068         (?P<min_width>\d+)?
3069         (?P<precision>\.\d+)?
3070         (?P<len_mod>[hlL])?  # unused in python
3071         {1}  # conversion type
3072     )
3073 '''
3074
3075
3076 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3077
3078
3079 def limit_length(s, length):
3080     """ Add ellipses to overly long strings """
3081     if s is None:
3082         return None
3083     ELLIPSES = '...'
3084     if len(s) > length:
3085         return s[:length - len(ELLIPSES)] + ELLIPSES
3086     return s
3087
3088
3089 def version_tuple(v):
3090     return tuple(int(e) for e in re.split(r'[-.]', v))
3091
3092
3093 def is_outdated_version(version, limit, assume_new=True):
3094     if not version:
3095         return not assume_new
3096     try:
3097         return version_tuple(version) < version_tuple(limit)
3098     except ValueError:
3099         return not assume_new
3100
3101
3102 def ytdl_is_updateable():
3103     """ Returns if yt-dlp can be updated with -U """
3104
3105     from .update import is_non_updateable
3106
3107     return not is_non_updateable()
3108
3109
3110 def args_to_str(args):
3111     # Get a short string representation for a subprocess command
3112     return ' '.join(compat_shlex_quote(a) for a in args)
3113
3114
3115 def error_to_compat_str(err):
3116     err_str = str(err)
3117     # On python 2 error byte string must be decoded with proper
3118     # encoding rather than ascii
3119     if sys.version_info[0] < 3:
3120         err_str = err_str.decode(preferredencoding())
3121     return err_str
3122
3123
3124 def mimetype2ext(mt):
3125     if mt is None:
3126         return None
3127
3128     mt, _, params = mt.partition(';')
3129     mt = mt.strip()
3130
3131     FULL_MAP = {
3132         'audio/mp4': 'm4a',
3133         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3134         # it's the most popular one
3135         'audio/mpeg': 'mp3',
3136         'audio/x-wav': 'wav',
3137         'audio/wav': 'wav',
3138         'audio/wave': 'wav',
3139     }
3140
3141     ext = FULL_MAP.get(mt)
3142     if ext is not None:
3143         return ext
3144
3145     SUBTYPE_MAP = {
3146         '3gpp': '3gp',
3147         'smptett+xml': 'tt',
3148         'ttaf+xml': 'dfxp',
3149         'ttml+xml': 'ttml',
3150         'x-flv': 'flv',
3151         'x-mp4-fragmented': 'mp4',
3152         'x-ms-sami': 'sami',
3153         'x-ms-wmv': 'wmv',
3154         'mpegurl': 'm3u8',
3155         'x-mpegurl': 'm3u8',
3156         'vnd.apple.mpegurl': 'm3u8',
3157         'dash+xml': 'mpd',
3158         'f4m+xml': 'f4m',
3159         'hds+xml': 'f4m',
3160         'vnd.ms-sstr+xml': 'ism',
3161         'quicktime': 'mov',
3162         'mp2t': 'ts',
3163         'x-wav': 'wav',
3164         'filmstrip+json': 'fs',
3165         'svg+xml': 'svg',
3166     }
3167
3168     _, _, subtype = mt.rpartition('/')
3169     ext = SUBTYPE_MAP.get(subtype.lower())
3170     if ext is not None:
3171         return ext
3172
3173     SUFFIX_MAP = {
3174         'json': 'json',
3175         'xml': 'xml',
3176         'zip': 'zip',
3177         'gzip': 'gz',
3178     }
3179
3180     _, _, suffix = subtype.partition('+')
3181     ext = SUFFIX_MAP.get(suffix)
3182     if ext is not None:
3183         return ext
3184
3185     return subtype.replace('+', '.')
3186
3187
3188 def ext2mimetype(ext_or_url):
3189     if not ext_or_url:
3190         return None
3191     if '.' not in ext_or_url:
3192         ext_or_url = f'file.{ext_or_url}'
3193     return mimetypes.guess_type(ext_or_url)[0]
3194
3195
3196 def parse_codecs(codecs_str):
3197     # http://tools.ietf.org/html/rfc6381
3198     if not codecs_str:
3199         return {}
3200     split_codecs = list(filter(None, map(
3201         str.strip, codecs_str.strip().strip(',').split(','))))
3202     vcodec, acodec, tcodec, hdr = None, None, None, None
3203     for full_codec in split_codecs:
3204         parts = full_codec.split('.')
3205         codec = parts[0].replace('0', '')
3206         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3207                      'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3208             if not vcodec:
3209                 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3210                 if codec in ('dvh1', 'dvhe'):
3211                     hdr = 'DV'
3212                 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3213                     hdr = 'HDR10'
3214                 elif full_codec.replace('0', '').startswith('vp9.2'):
3215                     hdr = 'HDR10'
3216         elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3217             if not acodec:
3218                 acodec = full_codec
3219         elif codec in ('stpp', 'wvtt',):
3220             if not tcodec:
3221                 tcodec = full_codec
3222         else:
3223             write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3224     if vcodec or acodec or tcodec:
3225         return {
3226             'vcodec': vcodec or 'none',
3227             'acodec': acodec or 'none',
3228             'dynamic_range': hdr,
3229             **({'tcodec': tcodec} if tcodec is not None else {}),
3230         }
3231     elif len(split_codecs) == 2:
3232         return {
3233             'vcodec': split_codecs[0],
3234             'acodec': split_codecs[1],
3235         }
3236     return {}
3237
3238
3239 def urlhandle_detect_ext(url_handle):
3240     getheader = url_handle.headers.get
3241
3242     cd = getheader('Content-Disposition')
3243     if cd:
3244         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3245         if m:
3246             e = determine_ext(m.group('filename'), default_ext=None)
3247             if e:
3248                 return e
3249
3250     return mimetype2ext(getheader('Content-Type'))
3251
3252
3253 def encode_data_uri(data, mime_type):
3254     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3255
3256
3257 def age_restricted(content_limit, age_limit):
3258     """ Returns True iff the content should be blocked """
3259
3260     if age_limit is None:  # No limit set
3261         return False
3262     if content_limit is None:
3263         return False  # Content available for everyone
3264     return age_limit < content_limit
3265
3266
3267 def is_html(first_bytes):
3268     """ Detect whether a file contains HTML by examining its first bytes. """
3269
3270     BOMS = [
3271         (b'\xef\xbb\xbf', 'utf-8'),
3272         (b'\x00\x00\xfe\xff', 'utf-32-be'),
3273         (b'\xff\xfe\x00\x00', 'utf-32-le'),
3274         (b'\xff\xfe', 'utf-16-le'),
3275         (b'\xfe\xff', 'utf-16-be'),
3276     ]
3277     for bom, enc in BOMS:
3278         if first_bytes.startswith(bom):
3279             s = first_bytes[len(bom):].decode(enc, 'replace')
3280             break
3281     else:
3282         s = first_bytes.decode('utf-8', 'replace')
3283
3284     return re.match(r'^\s*<', s)
3285
3286
3287 def determine_protocol(info_dict):
3288     protocol = info_dict.get('protocol')
3289     if protocol is not None:
3290         return protocol
3291
3292     url = sanitize_url(info_dict['url'])
3293     if url.startswith('rtmp'):
3294         return 'rtmp'
3295     elif url.startswith('mms'):
3296         return 'mms'
3297     elif url.startswith('rtsp'):
3298         return 'rtsp'
3299
3300     ext = determine_ext(url)
3301     if ext == 'm3u8':
3302         return 'm3u8'
3303     elif ext == 'f4m':
3304         return 'f4m'
3305
3306     return compat_urllib_parse_urlparse(url).scheme
3307
3308
3309 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3310     """ Render a list of rows, each as a list of values.
3311     Text after a \t will be right aligned """
3312     def width(string):
3313         return len(remove_terminal_sequences(string).replace('\t', ''))
3314
3315     def get_max_lens(table):
3316         return [max(width(str(v)) for v in col) for col in zip(*table)]
3317
3318     def filter_using_list(row, filterArray):
3319         return [col for (take, col) in zip(filterArray, row) if take]
3320
3321     if hide_empty:
3322         max_lens = get_max_lens(data)
3323         header_row = filter_using_list(header_row, max_lens)
3324         data = [filter_using_list(row, max_lens) for row in data]
3325
3326     table = [header_row] + data
3327     max_lens = get_max_lens(table)
3328     extra_gap += 1
3329     if delim:
3330         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3331         table[1][-1] = table[1][-1][:-extra_gap]  # Remove extra_gap from end of delimiter
3332     for row in table:
3333         for pos, text in enumerate(map(str, row)):
3334             if '\t' in text:
3335                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3336             else:
3337                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3338     ret = '\n'.join(''.join(row).rstrip() for row in table)
3339     return ret
3340
3341
3342 def _match_one(filter_part, dct, incomplete):
3343     # TODO: Generalize code with YoutubeDL._build_format_filter
3344     STRING_OPERATORS = {
3345         '*=': operator.contains,
3346         '^=': lambda attr, value: attr.startswith(value),
3347         '$=': lambda attr, value: attr.endswith(value),
3348         '~=': lambda attr, value: re.search(value, attr),
3349     }
3350     COMPARISON_OPERATORS = {
3351         **STRING_OPERATORS,
3352         '<=': operator.le,  # "<=" must be defined above "<"
3353         '<': operator.lt,
3354         '>=': operator.ge,
3355         '>': operator.gt,
3356         '=': operator.eq,
3357     }
3358
3359     operator_rex = re.compile(r'''(?x)\s*
3360         (?P<key>[a-z_]+)
3361         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3362         (?:
3363             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3364             (?P<strval>.+?)
3365         )
3366         \s*$
3367         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3368     m = operator_rex.search(filter_part)
3369     if m:
3370         m = m.groupdict()
3371         unnegated_op = COMPARISON_OPERATORS[m['op']]
3372         if m['negation']:
3373             op = lambda attr, value: not unnegated_op(attr, value)
3374         else:
3375             op = unnegated_op
3376         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3377         if m['quote']:
3378             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3379         actual_value = dct.get(m['key'])
3380         numeric_comparison = None
3381         if isinstance(actual_value, compat_numeric_types):
3382             # If the original field is a string and matching comparisonvalue is
3383             # a number we should respect the origin of the original field
3384             # and process comparison value as a string (see
3385             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3386             try:
3387                 numeric_comparison = int(comparison_value)
3388             except ValueError:
3389                 numeric_comparison = parse_filesize(comparison_value)
3390                 if numeric_comparison is None:
3391                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3392                 if numeric_comparison is None:
3393                     numeric_comparison = parse_duration(comparison_value)
3394         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3395             raise ValueError('Operator %s only supports string values!' % m['op'])
3396         if actual_value is None:
3397             return incomplete or m['none_inclusive']
3398         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3399
3400     UNARY_OPERATORS = {
3401         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3402         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3403     }
3404     operator_rex = re.compile(r'''(?x)\s*
3405         (?P<op>%s)\s*(?P<key>[a-z_]+)
3406         \s*$
3407         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3408     m = operator_rex.search(filter_part)
3409     if m:
3410         op = UNARY_OPERATORS[m.group('op')]
3411         actual_value = dct.get(m.group('key'))
3412         if incomplete and actual_value is None:
3413             return True
3414         return op(actual_value)
3415
3416     raise ValueError('Invalid filter part %r' % filter_part)
3417
3418
3419 def match_str(filter_str, dct, incomplete=False):
3420     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3421         When incomplete, all conditions passes on missing fields
3422     """
3423     return all(
3424         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3425         for filter_part in re.split(r'(?<!\\)&', filter_str))
3426
3427
3428 def match_filter_func(filter_str):
3429     def _match_func(info_dict, *args, **kwargs):
3430         if match_str(filter_str, info_dict, *args, **kwargs):
3431             return None
3432         else:
3433             video_title = info_dict.get('title', info_dict.get('id', 'video'))
3434             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3435     return _match_func
3436
3437
3438 def parse_dfxp_time_expr(time_expr):
3439     if not time_expr:
3440         return
3441
3442     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3443     if mobj:
3444         return float(mobj.group('time_offset'))
3445
3446     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3447     if mobj:
3448         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3449
3450
3451 def srt_subtitles_timecode(seconds):
3452     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3453
3454
3455 def ass_subtitles_timecode(seconds):
3456     time = timetuple_from_msec(seconds * 1000)
3457     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3458
3459
3460 def dfxp2srt(dfxp_data):
3461     '''
3462     @param dfxp_data A bytes-like object containing DFXP data
3463     @returns A unicode object containing converted SRT data
3464     '''
3465     LEGACY_NAMESPACES = (
3466         (b'http://www.w3.org/ns/ttml', [
3467             b'http://www.w3.org/2004/11/ttaf1',
3468             b'http://www.w3.org/2006/04/ttaf1',
3469             b'http://www.w3.org/2006/10/ttaf1',
3470         ]),
3471         (b'http://www.w3.org/ns/ttml#styling', [
3472             b'http://www.w3.org/ns/ttml#style',
3473         ]),
3474     )
3475
3476     SUPPORTED_STYLING = [
3477         'color',
3478         'fontFamily',
3479         'fontSize',
3480         'fontStyle',
3481         'fontWeight',
3482         'textDecoration'
3483     ]
3484
3485     _x = functools.partial(xpath_with_ns, ns_map={
3486         'xml': 'http://www.w3.org/XML/1998/namespace',
3487         'ttml': 'http://www.w3.org/ns/ttml',
3488         'tts': 'http://www.w3.org/ns/ttml#styling',
3489     })
3490
3491     styles = {}
3492     default_style = {}
3493
3494     class TTMLPElementParser(object):
3495         _out = ''
3496         _unclosed_elements = []
3497         _applied_styles = []
3498
3499         def start(self, tag, attrib):
3500             if tag in (_x('ttml:br'), 'br'):
3501                 self._out += '\n'
3502             else:
3503                 unclosed_elements = []
3504                 style = {}
3505                 element_style_id = attrib.get('style')
3506                 if default_style:
3507                     style.update(default_style)
3508                 if element_style_id:
3509                     style.update(styles.get(element_style_id, {}))
3510                 for prop in SUPPORTED_STYLING:
3511                     prop_val = attrib.get(_x('tts:' + prop))
3512                     if prop_val:
3513                         style[prop] = prop_val
3514                 if style:
3515                     font = ''
3516                     for k, v in sorted(style.items()):
3517                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3518                             continue
3519                         if k == 'color':
3520                             font += ' color="%s"' % v
3521                         elif k == 'fontSize':
3522                             font += ' size="%s"' % v
3523                         elif k == 'fontFamily':
3524                             font += ' face="%s"' % v
3525                         elif k == 'fontWeight' and v == 'bold':
3526                             self._out += '<b>'
3527                             unclosed_elements.append('b')
3528                         elif k == 'fontStyle' and v == 'italic':
3529                             self._out += '<i>'
3530                             unclosed_elements.append('i')
3531                         elif k == 'textDecoration' and v == 'underline':
3532                             self._out += '<u>'
3533                             unclosed_elements.append('u')
3534                     if font:
3535                         self._out += '<font' + font + '>'
3536                         unclosed_elements.append('font')
3537                     applied_style = {}
3538                     if self._applied_styles:
3539                         applied_style.update(self._applied_styles[-1])
3540                     applied_style.update(style)
3541                     self._applied_styles.append(applied_style)
3542                 self._unclosed_elements.append(unclosed_elements)
3543
3544         def end(self, tag):
3545             if tag not in (_x('ttml:br'), 'br'):
3546                 unclosed_elements = self._unclosed_elements.pop()
3547                 for element in reversed(unclosed_elements):
3548                     self._out += '</%s>' % element
3549                 if unclosed_elements and self._applied_styles:
3550                     self._applied_styles.pop()
3551
3552         def data(self, data):
3553             self._out += data
3554
3555         def close(self):
3556             return self._out.strip()
3557
3558     def parse_node(node):
3559         target = TTMLPElementParser()
3560         parser = xml.etree.ElementTree.XMLParser(target=target)
3561         parser.feed(xml.etree.ElementTree.tostring(node))
3562         return parser.close()
3563
3564     for k, v in LEGACY_NAMESPACES:
3565         for ns in v:
3566             dfxp_data = dfxp_data.replace(ns, k)
3567
3568     dfxp = compat_etree_fromstring(dfxp_data)
3569     out = []
3570     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3571
3572     if not paras:
3573         raise ValueError('Invalid dfxp/TTML subtitle')
3574
3575     repeat = False
3576     while True:
3577         for style in dfxp.findall(_x('.//ttml:style')):
3578             style_id = style.get('id') or style.get(_x('xml:id'))
3579             if not style_id:
3580                 continue
3581             parent_style_id = style.get('style')
3582             if parent_style_id:
3583                 if parent_style_id not in styles:
3584                     repeat = True
3585                     continue
3586                 styles[style_id] = styles[parent_style_id].copy()
3587             for prop in SUPPORTED_STYLING:
3588                 prop_val = style.get(_x('tts:' + prop))
3589                 if prop_val:
3590                     styles.setdefault(style_id, {})[prop] = prop_val
3591         if repeat:
3592             repeat = False
3593         else:
3594             break
3595
3596     for p in ('body', 'div'):
3597         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3598         if ele is None:
3599             continue
3600         style = styles.get(ele.get('style'))
3601         if not style:
3602             continue
3603         default_style.update(style)
3604
3605     for para, index in zip(paras, itertools.count(1)):
3606         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3607         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3608         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3609         if begin_time is None:
3610             continue
3611         if not end_time:
3612             if not dur:
3613                 continue
3614             end_time = begin_time + dur
3615         out.append('%d\n%s --> %s\n%s\n\n' % (
3616             index,
3617             srt_subtitles_timecode(begin_time),
3618             srt_subtitles_timecode(end_time),
3619             parse_node(para)))
3620
3621     return ''.join(out)
3622
3623
3624 def cli_option(params, command_option, param):
3625     param = params.get(param)
3626     if param:
3627         param = compat_str(param)
3628     return [command_option, param] if param is not None else []
3629
3630
3631 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3632     param = params.get(param)
3633     if param is None:
3634         return []
3635     assert isinstance(param, bool)
3636     if separator:
3637         return [command_option + separator + (true_value if param else false_value)]
3638     return [command_option, true_value if param else false_value]
3639
3640
3641 def cli_valueless_option(params, command_option, param, expected_value=True):
3642     param = params.get(param)
3643     return [command_option] if param == expected_value else []
3644
3645
3646 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3647     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3648         if use_compat:
3649             return argdict
3650         else:
3651             argdict = None
3652     if argdict is None:
3653         return default
3654     assert isinstance(argdict, dict)
3655
3656     assert isinstance(keys, (list, tuple))
3657     for key_list in keys:
3658         arg_list = list(filter(
3659             lambda x: x is not None,
3660             [argdict.get(key.lower()) for key in variadic(key_list)]))
3661         if arg_list:
3662             return [arg for args in arg_list for arg in args]
3663     return default
3664
3665
3666 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3667     main_key, exe = main_key.lower(), exe.lower()
3668     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3669     keys = [f'{root_key}{k}' for k in (keys or [''])]
3670     if root_key in keys:
3671         if main_key != exe:
3672             keys.append((main_key, exe))
3673         keys.append('default')
3674     else:
3675         use_compat = False
3676     return cli_configuration_args(argdict, keys, default, use_compat)
3677
3678
3679 class ISO639Utils(object):
3680     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3681     _lang_map = {
3682         'aa': 'aar',
3683         'ab': 'abk',
3684         'ae': 'ave',
3685         'af': 'afr',
3686         'ak': 'aka',
3687         'am': 'amh',
3688         'an': 'arg',
3689         'ar': 'ara',
3690         'as': 'asm',
3691         'av': 'ava',
3692         'ay': 'aym',
3693         'az': 'aze',
3694         'ba': 'bak',
3695         'be': 'bel',
3696         'bg': 'bul',
3697         'bh': 'bih',
3698         'bi': 'bis',
3699         'bm': 'bam',
3700         'bn': 'ben',
3701         'bo': 'bod',
3702         'br': 'bre',
3703         'bs': 'bos',
3704         'ca': 'cat',
3705         'ce': 'che',
3706         'ch': 'cha',
3707         'co': 'cos',
3708         'cr': 'cre',
3709         'cs': 'ces',
3710         'cu': 'chu',
3711         'cv': 'chv',
3712         'cy': 'cym',
3713         'da': 'dan',
3714         'de': 'deu',
3715         'dv': 'div',
3716         'dz': 'dzo',
3717         'ee': 'ewe',
3718         'el': 'ell',
3719         'en': 'eng',
3720         'eo': 'epo',
3721         'es': 'spa',
3722         'et': 'est',
3723         'eu': 'eus',
3724         'fa': 'fas',
3725         'ff': 'ful',
3726         'fi': 'fin',
3727         'fj': 'fij',
3728         'fo': 'fao',
3729         'fr': 'fra',
3730         'fy': 'fry',
3731         'ga': 'gle',
3732         'gd': 'gla',
3733         'gl': 'glg',
3734         'gn': 'grn',
3735         'gu': 'guj',
3736         'gv': 'glv',
3737         'ha': 'hau',
3738         'he': 'heb',
3739         'iw': 'heb',  # Replaced by he in 1989 revision
3740         'hi': 'hin',
3741         'ho': 'hmo',
3742         'hr': 'hrv',
3743         'ht': 'hat',
3744         'hu': 'hun',
3745         'hy': 'hye',
3746         'hz': 'her',
3747         'ia': 'ina',
3748         'id': 'ind',
3749         'in': 'ind',  # Replaced by id in 1989 revision
3750         'ie': 'ile',
3751         'ig': 'ibo',
3752         'ii': 'iii',
3753         'ik': 'ipk',
3754         'io': 'ido',
3755         'is': 'isl',
3756         'it': 'ita',
3757         'iu': 'iku',
3758         'ja': 'jpn',
3759         'jv': 'jav',
3760         'ka': 'kat',
3761         'kg': 'kon',
3762         'ki': 'kik',
3763         'kj': 'kua',
3764         'kk': 'kaz',
3765         'kl': 'kal',
3766         'km': 'khm',
3767         'kn': 'kan',
3768         'ko': 'kor',
3769         'kr': 'kau',
3770         'ks': 'kas',
3771         'ku': 'kur',
3772         'kv': 'kom',
3773         'kw': 'cor',
3774         'ky': 'kir',
3775         'la': 'lat',
3776         'lb': 'ltz',
3777         'lg': 'lug',
3778         'li': 'lim',
3779         'ln': 'lin',
3780         'lo': 'lao',
3781         'lt': 'lit',
3782         'lu': 'lub',
3783         'lv': 'lav',
3784         'mg': 'mlg',
3785         'mh': 'mah',
3786         'mi': 'mri',
3787         'mk': 'mkd',
3788         'ml': 'mal',
3789         'mn': 'mon',
3790         'mr': 'mar',
3791         'ms': 'msa',
3792         'mt': 'mlt',
3793         'my': 'mya',
3794         'na': 'nau',
3795         'nb': 'nob',
3796         'nd': 'nde',
3797         'ne': 'nep',
3798         'ng': 'ndo',
3799         'nl': 'nld',
3800         'nn': 'nno',
3801         'no': 'nor',
3802         'nr': 'nbl',
3803         'nv': 'nav',
3804         'ny': 'nya',
3805         'oc': 'oci',
3806         'oj': 'oji',
3807         'om': 'orm',
3808         'or': 'ori',
3809         'os': 'oss',
3810         'pa': 'pan',
3811         'pi': 'pli',
3812         'pl': 'pol',
3813         'ps': 'pus',
3814         'pt': 'por',
3815         'qu': 'que',
3816         'rm': 'roh',
3817         'rn': 'run',
3818         'ro': 'ron',
3819         'ru': 'rus',
3820         'rw': 'kin',
3821         'sa': 'san',
3822         'sc': 'srd',
3823         'sd': 'snd',
3824         'se': 'sme',
3825         'sg': 'sag',
3826         'si': 'sin',
3827         'sk': 'slk',
3828         'sl': 'slv',
3829         'sm': 'smo',
3830         'sn': 'sna',
3831         'so': 'som',
3832         'sq': 'sqi',
3833         'sr': 'srp',
3834         'ss': 'ssw',
3835         'st': 'sot',
3836         'su': 'sun',
3837         'sv': 'swe',
3838         'sw': 'swa',
3839         'ta': 'tam',
3840         'te': 'tel',
3841         'tg': 'tgk',
3842         'th': 'tha',
3843         'ti': 'tir',
3844         'tk': 'tuk',
3845         'tl': 'tgl',
3846         'tn': 'tsn',
3847         'to': 'ton',
3848         'tr': 'tur',
3849         'ts': 'tso',
3850         'tt': 'tat',
3851         'tw': 'twi',
3852         'ty': 'tah',
3853         'ug': 'uig',
3854         'uk': 'ukr',
3855         'ur': 'urd',
3856         'uz': 'uzb',
3857         've': 'ven',
3858         'vi': 'vie',
3859         'vo': 'vol',
3860         'wa': 'wln',
3861         'wo': 'wol',
3862         'xh': 'xho',
3863         'yi': 'yid',
3864         'ji': 'yid',  # Replaced by yi in 1989 revision
3865         'yo': 'yor',
3866         'za': 'zha',
3867         'zh': 'zho',
3868         'zu': 'zul',
3869     }
3870
3871     @classmethod
3872     def short2long(cls, code):
3873         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3874         return cls._lang_map.get(code[:2])
3875
3876     @classmethod
3877     def long2short(cls, code):
3878         """Convert language code from ISO 639-2/T to ISO 639-1"""
3879         for short_name, long_name in cls._lang_map.items():
3880             if long_name == code:
3881                 return short_name
3882
3883
3884 class ISO3166Utils(object):
3885     # From http://data.okfn.org/data/core/country-list
3886     _country_map = {
3887         'AF': 'Afghanistan',
3888         'AX': 'Åland Islands',
3889         'AL': 'Albania',
3890         'DZ': 'Algeria',
3891         'AS': 'American Samoa',
3892         'AD': 'Andorra',
3893         'AO': 'Angola',
3894         'AI': 'Anguilla',
3895         'AQ': 'Antarctica',
3896         'AG': 'Antigua and Barbuda',
3897         'AR': 'Argentina',
3898         'AM': 'Armenia',
3899         'AW': 'Aruba',
3900         'AU': 'Australia',
3901         'AT': 'Austria',
3902         'AZ': 'Azerbaijan',
3903         'BS': 'Bahamas',
3904         'BH': 'Bahrain',
3905         'BD': 'Bangladesh',
3906         'BB': 'Barbados',
3907         'BY': 'Belarus',
3908         'BE': 'Belgium',
3909         'BZ': 'Belize',
3910         'BJ': 'Benin',
3911         'BM': 'Bermuda',
3912         'BT': 'Bhutan',
3913         'BO': 'Bolivia, Plurinational State of',
3914         'BQ': 'Bonaire, Sint Eustatius and Saba',
3915         'BA': 'Bosnia and Herzegovina',
3916         'BW': 'Botswana',
3917         'BV': 'Bouvet Island',
3918         'BR': 'Brazil',
3919         'IO': 'British Indian Ocean Territory',
3920         'BN': 'Brunei Darussalam',
3921         'BG': 'Bulgaria',
3922         'BF': 'Burkina Faso',
3923         'BI': 'Burundi',
3924         'KH': 'Cambodia',
3925         'CM': 'Cameroon',
3926         'CA': 'Canada',
3927         'CV': 'Cape Verde',
3928         'KY': 'Cayman Islands',
3929         'CF': 'Central African Republic',
3930         'TD': 'Chad',
3931         'CL': 'Chile',
3932         'CN': 'China',
3933         'CX': 'Christmas Island',
3934         'CC': 'Cocos (Keeling) Islands',
3935         'CO': 'Colombia',
3936         'KM': 'Comoros',
3937         'CG': 'Congo',
3938         'CD': 'Congo, the Democratic Republic of the',
3939         'CK': 'Cook Islands',
3940         'CR': 'Costa Rica',
3941         'CI': 'Côte d\'Ivoire',
3942         'HR': 'Croatia',
3943         'CU': 'Cuba',
3944         'CW': 'Curaçao',
3945         'CY': 'Cyprus',
3946         'CZ': 'Czech Republic',
3947         'DK': 'Denmark',
3948         'DJ': 'Djibouti',
3949         'DM': 'Dominica',
3950         'DO': 'Dominican Republic',
3951         'EC': 'Ecuador',
3952         'EG': 'Egypt',
3953         'SV': 'El Salvador',
3954         'GQ': 'Equatorial Guinea',
3955         'ER': 'Eritrea',
3956         'EE': 'Estonia',
3957         'ET': 'Ethiopia',
3958         'FK': 'Falkland Islands (Malvinas)',
3959         'FO': 'Faroe Islands',
3960         'FJ': 'Fiji',
3961         'FI': 'Finland',
3962         'FR': 'France',
3963         'GF': 'French Guiana',
3964         'PF': 'French Polynesia',
3965         'TF': 'French Southern Territories',
3966         'GA': 'Gabon',
3967         'GM': 'Gambia',
3968         'GE': 'Georgia',
3969         'DE': 'Germany',
3970         'GH': 'Ghana',
3971         'GI': 'Gibraltar',
3972         'GR': 'Greece',
3973         'GL': 'Greenland',
3974         'GD': 'Grenada',
3975         'GP': 'Guadeloupe',
3976         'GU': 'Guam',
3977         'GT': 'Guatemala',
3978         'GG': 'Guernsey',
3979         'GN': 'Guinea',
3980         'GW': 'Guinea-Bissau',
3981         'GY': 'Guyana',
3982         'HT': 'Haiti',
3983         'HM': 'Heard Island and McDonald Islands',
3984         'VA': 'Holy See (Vatican City State)',
3985         'HN': 'Honduras',
3986         'HK': 'Hong Kong',
3987         'HU': 'Hungary',
3988         'IS': 'Iceland',
3989         'IN': 'India',
3990         'ID': 'Indonesia',
3991         'IR': 'Iran, Islamic Republic of',
3992         'IQ': 'Iraq',
3993         'IE': 'Ireland',
3994         'IM': 'Isle of Man',
3995         'IL': 'Israel',
3996         'IT': 'Italy',
3997         'JM': 'Jamaica',
3998         'JP': 'Japan',
3999         'JE': 'Jersey',
4000         'JO': 'Jordan',
4001         'KZ': 'Kazakhstan',
4002         'KE': 'Kenya',
4003         'KI': 'Kiribati',
4004         'KP': 'Korea, Democratic People\'s Republic of',
4005         'KR': 'Korea, Republic of',
4006         'KW': 'Kuwait',
4007         'KG': 'Kyrgyzstan',
4008         'LA': 'Lao People\'s Democratic Republic',
4009         'LV': 'Latvia',
4010         'LB': 'Lebanon',
4011         'LS': 'Lesotho',
4012         'LR': 'Liberia',
4013         'LY': 'Libya',
4014         'LI': 'Liechtenstein',
4015         'LT': 'Lithuania',
4016         'LU': 'Luxembourg',
4017         'MO': 'Macao',
4018         'MK': 'Macedonia, the Former Yugoslav Republic of',
4019         'MG': 'Madagascar',
4020         'MW': 'Malawi',
4021         'MY': 'Malaysia',
4022         'MV': 'Maldives',
4023         'ML': 'Mali',
4024         'MT': 'Malta',
4025         'MH': 'Marshall Islands',
4026         'MQ': 'Martinique',
4027         'MR': 'Mauritania',
4028         'MU': 'Mauritius',
4029         'YT': 'Mayotte',
4030         'MX': 'Mexico',
4031         'FM': 'Micronesia, Federated States of',
4032         'MD': 'Moldova, Republic of',
4033         'MC': 'Monaco',
4034         'MN': 'Mongolia',
4035         'ME': 'Montenegro',
4036         'MS': 'Montserrat',
4037         'MA': 'Morocco',
4038         'MZ': 'Mozambique',
4039         'MM': 'Myanmar',
4040         'NA': 'Namibia',
4041         'NR': 'Nauru',
4042         'NP': 'Nepal',
4043         'NL': 'Netherlands',
4044         'NC': 'New Caledonia',
4045         'NZ': 'New Zealand',
4046         'NI': 'Nicaragua',
4047         'NE': 'Niger',
4048         'NG': 'Nigeria',
4049         'NU': 'Niue',
4050         'NF': 'Norfolk Island',
4051         'MP': 'Northern Mariana Islands',
4052         'NO': 'Norway',
4053         'OM': 'Oman',
4054         'PK': 'Pakistan',
4055         'PW': 'Palau',
4056         'PS': 'Palestine, State of',
4057         'PA': 'Panama',
4058         'PG': 'Papua New Guinea',
4059         'PY': 'Paraguay',
4060         'PE': 'Peru',
4061         'PH': 'Philippines',
4062         'PN': 'Pitcairn',
4063         'PL': 'Poland',
4064         'PT': 'Portugal',
4065         'PR': 'Puerto Rico',
4066         'QA': 'Qatar',
4067         'RE': 'Réunion',
4068         'RO': 'Romania',
4069         'RU': 'Russian Federation',
4070         'RW': 'Rwanda',
4071         'BL': 'Saint Barthélemy',
4072         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4073         'KN': 'Saint Kitts and Nevis',
4074         'LC': 'Saint Lucia',
4075         'MF': 'Saint Martin (French part)',
4076         'PM': 'Saint Pierre and Miquelon',
4077         'VC': 'Saint Vincent and the Grenadines',
4078         'WS': 'Samoa',
4079         'SM': 'San Marino',
4080         'ST': 'Sao Tome and Principe',
4081         'SA': 'Saudi Arabia',
4082         'SN': 'Senegal',
4083         'RS': 'Serbia',
4084         'SC': 'Seychelles',
4085         'SL': 'Sierra Leone',
4086         'SG': 'Singapore',
4087         'SX': 'Sint Maarten (Dutch part)',
4088         'SK': 'Slovakia',
4089         'SI': 'Slovenia',
4090         'SB': 'Solomon Islands',
4091         'SO': 'Somalia',
4092         'ZA': 'South Africa',
4093         'GS': 'South Georgia and the South Sandwich Islands',
4094         'SS': 'South Sudan',
4095         'ES': 'Spain',
4096         'LK': 'Sri Lanka',
4097         'SD': 'Sudan',
4098         'SR': 'Suriname',
4099         'SJ': 'Svalbard and Jan Mayen',
4100         'SZ': 'Swaziland',
4101         'SE': 'Sweden',
4102         'CH': 'Switzerland',
4103         'SY': 'Syrian Arab Republic',
4104         'TW': 'Taiwan, Province of China',
4105         'TJ': 'Tajikistan',
4106         'TZ': 'Tanzania, United Republic of',
4107         'TH': 'Thailand',
4108         'TL': 'Timor-Leste',
4109         'TG': 'Togo',
4110         'TK': 'Tokelau',
4111         'TO': 'Tonga',
4112         'TT': 'Trinidad and Tobago',
4113         'TN': 'Tunisia',
4114         'TR': 'Turkey',
4115         'TM': 'Turkmenistan',
4116         'TC': 'Turks and Caicos Islands',
4117         'TV': 'Tuvalu',
4118         'UG': 'Uganda',
4119         'UA': 'Ukraine',
4120         'AE': 'United Arab Emirates',
4121         'GB': 'United Kingdom',
4122         'US': 'United States',
4123         'UM': 'United States Minor Outlying Islands',
4124         'UY': 'Uruguay',
4125         'UZ': 'Uzbekistan',
4126         'VU': 'Vanuatu',
4127         'VE': 'Venezuela, Bolivarian Republic of',
4128         'VN': 'Viet Nam',
4129         'VG': 'Virgin Islands, British',
4130         'VI': 'Virgin Islands, U.S.',
4131         'WF': 'Wallis and Futuna',
4132         'EH': 'Western Sahara',
4133         'YE': 'Yemen',
4134         'ZM': 'Zambia',
4135         'ZW': 'Zimbabwe',
4136     }
4137
4138     @classmethod
4139     def short2full(cls, code):
4140         """Convert an ISO 3166-2 country code to the corresponding full name"""
4141         return cls._country_map.get(code.upper())
4142
4143
4144 class GeoUtils(object):
4145     # Major IPv4 address blocks per country
4146     _country_ip_map = {
4147         'AD': '46.172.224.0/19',
4148         'AE': '94.200.0.0/13',
4149         'AF': '149.54.0.0/17',
4150         'AG': '209.59.64.0/18',
4151         'AI': '204.14.248.0/21',
4152         'AL': '46.99.0.0/16',
4153         'AM': '46.70.0.0/15',
4154         'AO': '105.168.0.0/13',
4155         'AP': '182.50.184.0/21',
4156         'AQ': '23.154.160.0/24',
4157         'AR': '181.0.0.0/12',
4158         'AS': '202.70.112.0/20',
4159         'AT': '77.116.0.0/14',
4160         'AU': '1.128.0.0/11',
4161         'AW': '181.41.0.0/18',
4162         'AX': '185.217.4.0/22',
4163         'AZ': '5.197.0.0/16',
4164         'BA': '31.176.128.0/17',
4165         'BB': '65.48.128.0/17',
4166         'BD': '114.130.0.0/16',
4167         'BE': '57.0.0.0/8',
4168         'BF': '102.178.0.0/15',
4169         'BG': '95.42.0.0/15',
4170         'BH': '37.131.0.0/17',
4171         'BI': '154.117.192.0/18',
4172         'BJ': '137.255.0.0/16',
4173         'BL': '185.212.72.0/23',
4174         'BM': '196.12.64.0/18',
4175         'BN': '156.31.0.0/16',
4176         'BO': '161.56.0.0/16',
4177         'BQ': '161.0.80.0/20',
4178         'BR': '191.128.0.0/12',
4179         'BS': '24.51.64.0/18',
4180         'BT': '119.2.96.0/19',
4181         'BW': '168.167.0.0/16',
4182         'BY': '178.120.0.0/13',
4183         'BZ': '179.42.192.0/18',
4184         'CA': '99.224.0.0/11',
4185         'CD': '41.243.0.0/16',
4186         'CF': '197.242.176.0/21',
4187         'CG': '160.113.0.0/16',
4188         'CH': '85.0.0.0/13',
4189         'CI': '102.136.0.0/14',
4190         'CK': '202.65.32.0/19',
4191         'CL': '152.172.0.0/14',
4192         'CM': '102.244.0.0/14',
4193         'CN': '36.128.0.0/10',
4194         'CO': '181.240.0.0/12',
4195         'CR': '201.192.0.0/12',
4196         'CU': '152.206.0.0/15',
4197         'CV': '165.90.96.0/19',
4198         'CW': '190.88.128.0/17',
4199         'CY': '31.153.0.0/16',
4200         'CZ': '88.100.0.0/14',
4201         'DE': '53.0.0.0/8',
4202         'DJ': '197.241.0.0/17',
4203         'DK': '87.48.0.0/12',
4204         'DM': '192.243.48.0/20',
4205         'DO': '152.166.0.0/15',
4206         'DZ': '41.96.0.0/12',
4207         'EC': '186.68.0.0/15',
4208         'EE': '90.190.0.0/15',
4209         'EG': '156.160.0.0/11',
4210         'ER': '196.200.96.0/20',
4211         'ES': '88.0.0.0/11',
4212         'ET': '196.188.0.0/14',
4213         'EU': '2.16.0.0/13',
4214         'FI': '91.152.0.0/13',
4215         'FJ': '144.120.0.0/16',
4216         'FK': '80.73.208.0/21',
4217         'FM': '119.252.112.0/20',
4218         'FO': '88.85.32.0/19',
4219         'FR': '90.0.0.0/9',
4220         'GA': '41.158.0.0/15',
4221         'GB': '25.0.0.0/8',
4222         'GD': '74.122.88.0/21',
4223         'GE': '31.146.0.0/16',
4224         'GF': '161.22.64.0/18',
4225         'GG': '62.68.160.0/19',
4226         'GH': '154.160.0.0/12',
4227         'GI': '95.164.0.0/16',
4228         'GL': '88.83.0.0/19',
4229         'GM': '160.182.0.0/15',
4230         'GN': '197.149.192.0/18',
4231         'GP': '104.250.0.0/19',
4232         'GQ': '105.235.224.0/20',
4233         'GR': '94.64.0.0/13',
4234         'GT': '168.234.0.0/16',
4235         'GU': '168.123.0.0/16',
4236         'GW': '197.214.80.0/20',
4237         'GY': '181.41.64.0/18',
4238         'HK': '113.252.0.0/14',
4239         'HN': '181.210.0.0/16',
4240         'HR': '93.136.0.0/13',
4241         'HT': '148.102.128.0/17',
4242         'HU': '84.0.0.0/14',
4243         'ID': '39.192.0.0/10',
4244         'IE': '87.32.0.0/12',
4245         'IL': '79.176.0.0/13',
4246         'IM': '5.62.80.0/20',
4247         'IN': '117.192.0.0/10',
4248         'IO': '203.83.48.0/21',
4249         'IQ': '37.236.0.0/14',
4250         'IR': '2.176.0.0/12',
4251         'IS': '82.221.0.0/16',
4252         'IT': '79.0.0.0/10',
4253         'JE': '87.244.64.0/18',
4254         'JM': '72.27.0.0/17',
4255         'JO': '176.29.0.0/16',
4256         'JP': '133.0.0.0/8',
4257         'KE': '105.48.0.0/12',
4258         'KG': '158.181.128.0/17',
4259         'KH': '36.37.128.0/17',
4260         'KI': '103.25.140.0/22',
4261         'KM': '197.255.224.0/20',
4262         'KN': '198.167.192.0/19',
4263         'KP': '175.45.176.0/22',
4264         'KR': '175.192.0.0/10',
4265         'KW': '37.36.0.0/14',
4266         'KY': '64.96.0.0/15',
4267         'KZ': '2.72.0.0/13',
4268         'LA': '115.84.64.0/18',
4269         'LB': '178.135.0.0/16',
4270         'LC': '24.92.144.0/20',
4271         'LI': '82.117.0.0/19',
4272         'LK': '112.134.0.0/15',
4273         'LR': '102.183.0.0/16',
4274         'LS': '129.232.0.0/17',
4275         'LT': '78.56.0.0/13',
4276         'LU': '188.42.0.0/16',
4277         'LV': '46.109.0.0/16',
4278         'LY': '41.252.0.0/14',
4279         'MA': '105.128.0.0/11',
4280         'MC': '88.209.64.0/18',
4281         'MD': '37.246.0.0/16',
4282         'ME': '178.175.0.0/17',
4283         'MF': '74.112.232.0/21',
4284         'MG': '154.126.0.0/17',
4285         'MH': '117.103.88.0/21',
4286         'MK': '77.28.0.0/15',
4287         'ML': '154.118.128.0/18',
4288         'MM': '37.111.0.0/17',
4289         'MN': '49.0.128.0/17',
4290         'MO': '60.246.0.0/16',
4291         'MP': '202.88.64.0/20',
4292         'MQ': '109.203.224.0/19',
4293         'MR': '41.188.64.0/18',
4294         'MS': '208.90.112.0/22',
4295         'MT': '46.11.0.0/16',
4296         'MU': '105.16.0.0/12',
4297         'MV': '27.114.128.0/18',
4298         'MW': '102.70.0.0/15',
4299         'MX': '187.192.0.0/11',
4300         'MY': '175.136.0.0/13',
4301         'MZ': '197.218.0.0/15',
4302         'NA': '41.182.0.0/16',
4303         'NC': '101.101.0.0/18',
4304         'NE': '197.214.0.0/18',
4305         'NF': '203.17.240.0/22',
4306         'NG': '105.112.0.0/12',
4307         'NI': '186.76.0.0/15',
4308         'NL': '145.96.0.0/11',
4309         'NO': '84.208.0.0/13',
4310         'NP': '36.252.0.0/15',
4311         'NR': '203.98.224.0/19',
4312         'NU': '49.156.48.0/22',
4313         'NZ': '49.224.0.0/14',
4314         'OM': '5.36.0.0/15',
4315         'PA': '186.72.0.0/15',
4316         'PE': '186.160.0.0/14',
4317         'PF': '123.50.64.0/18',
4318         'PG': '124.240.192.0/19',
4319         'PH': '49.144.0.0/13',
4320         'PK': '39.32.0.0/11',
4321         'PL': '83.0.0.0/11',
4322         'PM': '70.36.0.0/20',
4323         'PR': '66.50.0.0/16',
4324         'PS': '188.161.0.0/16',
4325         'PT': '85.240.0.0/13',
4326         'PW': '202.124.224.0/20',
4327         'PY': '181.120.0.0/14',
4328         'QA': '37.210.0.0/15',
4329         'RE': '102.35.0.0/16',
4330         'RO': '79.112.0.0/13',
4331         'RS': '93.86.0.0/15',
4332         'RU': '5.136.0.0/13',
4333         'RW': '41.186.0.0/16',
4334         'SA': '188.48.0.0/13',
4335         'SB': '202.1.160.0/19',
4336         'SC': '154.192.0.0/11',
4337         'SD': '102.120.0.0/13',
4338         'SE': '78.64.0.0/12',
4339         'SG': '8.128.0.0/10',
4340         'SI': '188.196.0.0/14',
4341         'SK': '78.98.0.0/15',
4342         'SL': '102.143.0.0/17',
4343         'SM': '89.186.32.0/19',
4344         'SN': '41.82.0.0/15',
4345         'SO': '154.115.192.0/18',
4346         'SR': '186.179.128.0/17',
4347         'SS': '105.235.208.0/21',
4348         'ST': '197.159.160.0/19',
4349         'SV': '168.243.0.0/16',
4350         'SX': '190.102.0.0/20',
4351         'SY': '5.0.0.0/16',
4352         'SZ': '41.84.224.0/19',
4353         'TC': '65.255.48.0/20',
4354         'TD': '154.68.128.0/19',
4355         'TG': '196.168.0.0/14',
4356         'TH': '171.96.0.0/13',
4357         'TJ': '85.9.128.0/18',
4358         'TK': '27.96.24.0/21',
4359         'TL': '180.189.160.0/20',
4360         'TM': '95.85.96.0/19',
4361         'TN': '197.0.0.0/11',
4362         'TO': '175.176.144.0/21',
4363         'TR': '78.160.0.0/11',
4364         'TT': '186.44.0.0/15',
4365         'TV': '202.2.96.0/19',
4366         'TW': '120.96.0.0/11',
4367         'TZ': '156.156.0.0/14',
4368         'UA': '37.52.0.0/14',
4369         'UG': '102.80.0.0/13',
4370         'US': '6.0.0.0/8',
4371         'UY': '167.56.0.0/13',
4372         'UZ': '84.54.64.0/18',
4373         'VA': '212.77.0.0/19',
4374         'VC': '207.191.240.0/21',
4375         'VE': '186.88.0.0/13',
4376         'VG': '66.81.192.0/20',
4377         'VI': '146.226.0.0/16',
4378         'VN': '14.160.0.0/11',
4379         'VU': '202.80.32.0/20',
4380         'WF': '117.20.32.0/21',
4381         'WS': '202.4.32.0/19',
4382         'YE': '134.35.0.0/16',
4383         'YT': '41.242.116.0/22',
4384         'ZA': '41.0.0.0/11',
4385         'ZM': '102.144.0.0/13',
4386         'ZW': '102.177.192.0/18',
4387     }
4388
4389     @classmethod
4390     def random_ipv4(cls, code_or_block):
4391         if len(code_or_block) == 2:
4392             block = cls._country_ip_map.get(code_or_block.upper())
4393             if not block:
4394                 return None
4395         else:
4396             block = code_or_block
4397         addr, preflen = block.split('/')
4398         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4399         addr_max = addr_min | (0xffffffff >> int(preflen))
4400         return compat_str(socket.inet_ntoa(
4401             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4402
4403
4404 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4405     def __init__(self, proxies=None):
4406         # Set default handlers
4407         for type in ('http', 'https'):
4408             setattr(self, '%s_open' % type,
4409                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4410                         meth(r, proxy, type))
4411         compat_urllib_request.ProxyHandler.__init__(self, proxies)
4412
4413     def proxy_open(self, req, proxy, type):
4414         req_proxy = req.headers.get('Ytdl-request-proxy')
4415         if req_proxy is not None:
4416             proxy = req_proxy
4417             del req.headers['Ytdl-request-proxy']
4418
4419         if proxy == '__noproxy__':
4420             return None  # No Proxy
4421         if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4422             req.add_header('Ytdl-socks-proxy', proxy)
4423             # yt-dlp's http/https handlers do wrapping the socket with socks
4424             return None
4425         return compat_urllib_request.ProxyHandler.proxy_open(
4426             self, req, proxy, type)
4427
4428
4429 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4430 # released into Public Domain
4431 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4432
4433 def long_to_bytes(n, blocksize=0):
4434     """long_to_bytes(n:long, blocksize:int) : string
4435     Convert a long integer to a byte string.
4436
4437     If optional blocksize is given and greater than zero, pad the front of the
4438     byte string with binary zeros so that the length is a multiple of
4439     blocksize.
4440     """
4441     # after much testing, this algorithm was deemed to be the fastest
4442     s = b''
4443     n = int(n)
4444     while n > 0:
4445         s = compat_struct_pack('>I', n & 0xffffffff) + s
4446         n = n >> 32
4447     # strip off leading zeros
4448     for i in range(len(s)):
4449         if s[i] != b'\000'[0]:
4450             break
4451     else:
4452         # only happens when n == 0
4453         s = b'\000'
4454         i = 0
4455     s = s[i:]
4456     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4457     # de-padding being done above, but sigh...
4458     if blocksize > 0 and len(s) % blocksize:
4459         s = (blocksize - len(s) % blocksize) * b'\000' + s
4460     return s
4461
4462
4463 def bytes_to_long(s):
4464     """bytes_to_long(string) : long
4465     Convert a byte string to a long integer.
4466
4467     This is (essentially) the inverse of long_to_bytes().
4468     """
4469     acc = 0
4470     length = len(s)
4471     if length % 4:
4472         extra = (4 - length % 4)
4473         s = b'\000' * extra + s
4474         length = length + extra
4475     for i in range(0, length, 4):
4476         acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4477     return acc
4478
4479
4480 def ohdave_rsa_encrypt(data, exponent, modulus):
4481     '''
4482     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4483
4484     Input:
4485         data: data to encrypt, bytes-like object
4486         exponent, modulus: parameter e and N of RSA algorithm, both integer
4487     Output: hex string of encrypted data
4488
4489     Limitation: supports one block encryption only
4490     '''
4491
4492     payload = int(binascii.hexlify(data[::-1]), 16)
4493     encrypted = pow(payload, exponent, modulus)
4494     return '%x' % encrypted
4495
4496
4497 def pkcs1pad(data, length):
4498     """
4499     Padding input data with PKCS#1 scheme
4500
4501     @param {int[]} data        input data
4502     @param {int}   length      target length
4503     @returns {int[]}           padded data
4504     """
4505     if len(data) > length - 11:
4506         raise ValueError('Input data too long for PKCS#1 padding')
4507
4508     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4509     return [0, 2] + pseudo_random + [0] + data
4510
4511
4512 def encode_base_n(num, n, table=None):
4513     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4514     if not table:
4515         table = FULL_TABLE[:n]
4516
4517     if n > len(table):
4518         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4519
4520     if num == 0:
4521         return table[0]
4522
4523     ret = ''
4524     while num:
4525         ret = table[num % n] + ret
4526         num = num // n
4527     return ret
4528
4529
4530 def decode_packed_codes(code):
4531     mobj = re.search(PACKED_CODES_RE, code)
4532     obfuscated_code, base, count, symbols = mobj.groups()
4533     base = int(base)
4534     count = int(count)
4535     symbols = symbols.split('|')
4536     symbol_table = {}
4537
4538     while count:
4539         count -= 1
4540         base_n_count = encode_base_n(count, base)
4541         symbol_table[base_n_count] = symbols[count] or base_n_count
4542
4543     return re.sub(
4544         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4545         obfuscated_code)
4546
4547
4548 def caesar(s, alphabet, shift):
4549     if shift == 0:
4550         return s
4551     l = len(alphabet)
4552     return ''.join(
4553         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4554         for c in s)
4555
4556
4557 def rot47(s):
4558     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4559
4560
4561 def parse_m3u8_attributes(attrib):
4562     info = {}
4563     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4564         if val.startswith('"'):
4565             val = val[1:-1]
4566         info[key] = val
4567     return info
4568
4569
4570 def urshift(val, n):
4571     return val >> n if val >= 0 else (val + 0x100000000) >> n
4572
4573
4574 # Based on png2str() written by @gdkchan and improved by @yokrysty
4575 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4576 def decode_png(png_data):
4577     # Reference: https://www.w3.org/TR/PNG/
4578     header = png_data[8:]
4579
4580     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4581         raise IOError('Not a valid PNG file.')
4582
4583     int_map = {1: '>B', 2: '>H', 4: '>I'}
4584     unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4585
4586     chunks = []
4587
4588     while header:
4589         length = unpack_integer(header[:4])
4590         header = header[4:]
4591
4592         chunk_type = header[:4]
4593         header = header[4:]
4594
4595         chunk_data = header[:length]
4596         header = header[length:]
4597
4598         header = header[4:]  # Skip CRC
4599
4600         chunks.append({
4601             'type': chunk_type,
4602             'length': length,
4603             'data': chunk_data
4604         })
4605
4606     ihdr = chunks[0]['data']
4607
4608     width = unpack_integer(ihdr[:4])
4609     height = unpack_integer(ihdr[4:8])
4610
4611     idat = b''
4612
4613     for chunk in chunks:
4614         if chunk['type'] == b'IDAT':
4615             idat += chunk['data']
4616
4617     if not idat:
4618         raise IOError('Unable to read PNG data.')
4619
4620     decompressed_data = bytearray(zlib.decompress(idat))
4621
4622     stride = width * 3
4623     pixels = []
4624
4625     def _get_pixel(idx):
4626         x = idx % stride
4627         y = idx // stride
4628         return pixels[y][x]
4629
4630     for y in range(height):
4631         basePos = y * (1 + stride)
4632         filter_type = decompressed_data[basePos]
4633
4634         current_row = []
4635
4636         pixels.append(current_row)
4637
4638         for x in range(stride):
4639             color = decompressed_data[1 + basePos + x]
4640             basex = y * stride + x
4641             left = 0
4642             up = 0
4643
4644             if x > 2:
4645                 left = _get_pixel(basex - 3)
4646             if y > 0:
4647                 up = _get_pixel(basex - stride)
4648
4649             if filter_type == 1:  # Sub
4650                 color = (color + left) & 0xff
4651             elif filter_type == 2:  # Up
4652                 color = (color + up) & 0xff
4653             elif filter_type == 3:  # Average
4654                 color = (color + ((left + up) >> 1)) & 0xff
4655             elif filter_type == 4:  # Paeth
4656                 a = left
4657                 b = up
4658                 c = 0
4659
4660                 if x > 2 and y > 0:
4661                     c = _get_pixel(basex - stride - 3)
4662
4663                 p = a + b - c
4664
4665                 pa = abs(p - a)
4666                 pb = abs(p - b)
4667                 pc = abs(p - c)
4668
4669                 if pa <= pb and pa <= pc:
4670                     color = (color + a) & 0xff
4671                 elif pb <= pc:
4672                     color = (color + b) & 0xff
4673                 else:
4674                     color = (color + c) & 0xff
4675
4676             current_row.append(color)
4677
4678     return width, height, pixels
4679
4680
4681 def write_xattr(path, key, value):
4682     # This mess below finds the best xattr tool for the job
4683     try:
4684         # try the pyxattr module...
4685         import xattr
4686
4687         if hasattr(xattr, 'set'):  # pyxattr
4688             # Unicode arguments are not supported in python-pyxattr until
4689             # version 0.5.0
4690             # See https://github.com/ytdl-org/youtube-dl/issues/5498
4691             pyxattr_required_version = '0.5.0'
4692             if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4693                 # TODO: fallback to CLI tools
4694                 raise XAttrUnavailableError(
4695                     'python-pyxattr is detected but is too old. '
4696                     'yt-dlp requires %s or above while your version is %s. '
4697                     'Falling back to other xattr implementations' % (
4698                         pyxattr_required_version, xattr.__version__))
4699
4700             setxattr = xattr.set
4701         else:  # xattr
4702             setxattr = xattr.setxattr
4703
4704         try:
4705             setxattr(path, key, value)
4706         except EnvironmentError as e:
4707             raise XAttrMetadataError(e.errno, e.strerror)
4708
4709     except ImportError:
4710         if compat_os_name == 'nt':
4711             # Write xattrs to NTFS Alternate Data Streams:
4712             # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4713             assert ':' not in key
4714             assert os.path.exists(path)
4715
4716             ads_fn = path + ':' + key
4717             try:
4718                 with open(ads_fn, 'wb') as f:
4719                     f.write(value)
4720             except EnvironmentError as e:
4721                 raise XAttrMetadataError(e.errno, e.strerror)
4722         else:
4723             user_has_setfattr = check_executable('setfattr', ['--version'])
4724             user_has_xattr = check_executable('xattr', ['-h'])
4725
4726             if user_has_setfattr or user_has_xattr:
4727
4728                 value = value.decode('utf-8')
4729                 if user_has_setfattr:
4730                     executable = 'setfattr'
4731                     opts = ['-n', key, '-v', value]
4732                 elif user_has_xattr:
4733                     executable = 'xattr'
4734                     opts = ['-w', key, value]
4735
4736                 cmd = ([encodeFilename(executable, True)]
4737                        + [encodeArgument(o) for o in opts]
4738                        + [encodeFilename(path, True)])
4739
4740                 try:
4741                     p = Popen(
4742                         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4743                 except EnvironmentError as e:
4744                     raise XAttrMetadataError(e.errno, e.strerror)
4745                 stdout, stderr = p.communicate_or_kill()
4746                 stderr = stderr.decode('utf-8', 'replace')
4747                 if p.returncode != 0:
4748                     raise XAttrMetadataError(p.returncode, stderr)
4749
4750             else:
4751                 # On Unix, and can't find pyxattr, setfattr, or xattr.
4752                 if sys.platform.startswith('linux'):
4753                     raise XAttrUnavailableError(
4754                         "Couldn't find a tool to set the xattrs. "
4755                         "Install either the python 'pyxattr' or 'xattr' "
4756                         "modules, or the GNU 'attr' package "
4757                         "(which contains the 'setfattr' tool).")
4758                 else:
4759                     raise XAttrUnavailableError(
4760                         "Couldn't find a tool to set the xattrs. "
4761                         "Install either the python 'xattr' module, "
4762                         "or the 'xattr' binary.")
4763
4764
4765 def random_birthday(year_field, month_field, day_field):
4766     start_date = datetime.date(1950, 1, 1)
4767     end_date = datetime.date(1995, 12, 31)
4768     offset = random.randint(0, (end_date - start_date).days)
4769     random_date = start_date + datetime.timedelta(offset)
4770     return {
4771         year_field: str(random_date.year),
4772         month_field: str(random_date.month),
4773         day_field: str(random_date.day),
4774     }
4775
4776
4777 # Templates for internet shortcut files, which are plain text files.
4778 DOT_URL_LINK_TEMPLATE = '''
4779 [InternetShortcut]
4780 URL=%(url)s
4781 '''.lstrip()
4782
4783 DOT_WEBLOC_LINK_TEMPLATE = '''
4784 <?xml version="1.0" encoding="UTF-8"?>
4785 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4786 <plist version="1.0">
4787 <dict>
4788 \t<key>URL</key>
4789 \t<string>%(url)s</string>
4790 </dict>
4791 </plist>
4792 '''.lstrip()
4793
4794 DOT_DESKTOP_LINK_TEMPLATE = '''
4795 [Desktop Entry]
4796 Encoding=UTF-8
4797 Name=%(filename)s
4798 Type=Link
4799 URL=%(url)s
4800 Icon=text-html
4801 '''.lstrip()
4802
4803 LINK_TEMPLATES = {
4804     'url': DOT_URL_LINK_TEMPLATE,
4805     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4806     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4807 }
4808
4809
4810 def iri_to_uri(iri):
4811     """
4812     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4813
4814     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4815     """
4816
4817     iri_parts = compat_urllib_parse_urlparse(iri)
4818
4819     if '[' in iri_parts.netloc:
4820         raise ValueError('IPv6 URIs are not, yet, supported.')
4821         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4822
4823     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4824
4825     net_location = ''
4826     if iri_parts.username:
4827         net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4828         if iri_parts.password is not None:
4829             net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4830         net_location += '@'
4831
4832     net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
4833     # The 'idna' encoding produces ASCII text.
4834     if iri_parts.port is not None and iri_parts.port != 80:
4835         net_location += ':' + str(iri_parts.port)
4836
4837     return compat_urllib_parse_urlunparse(
4838         (iri_parts.scheme,
4839             net_location,
4840
4841             compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4842
4843             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4844             compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4845
4846             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4847             compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4848
4849             compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4850
4851     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4852
4853
4854 def to_high_limit_path(path):
4855     if sys.platform in ['win32', 'cygwin']:
4856         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4857         return r'\\?\ '.rstrip() + os.path.abspath(path)
4858
4859     return path
4860
4861
4862 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4863     if field is None:
4864         val = obj if obj is not None else default
4865     else:
4866         val = obj.get(field, default)
4867     if func and val not in ignore:
4868         val = func(val)
4869     return template % val if val not in ignore else default
4870
4871
4872 def clean_podcast_url(url):
4873     return re.sub(r'''(?x)
4874         (?:
4875             (?:
4876                 chtbl\.com/track|
4877                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4878                 play\.podtrac\.com
4879             )/[^/]+|
4880             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4881             flex\.acast\.com|
4882             pd(?:
4883                 cn\.co| # https://podcorn.com/analytics-prefix/
4884                 st\.fm # https://podsights.com/docs/
4885             )/e
4886         )/''', '', url)
4887
4888
4889 _HEX_TABLE = '0123456789abcdef'
4890
4891
4892 def random_uuidv4():
4893     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4894
4895
4896 def make_dir(path, to_screen=None):
4897     try:
4898         dn = os.path.dirname(path)
4899         if dn and not os.path.exists(dn):
4900             os.makedirs(dn)
4901         return True
4902     except (OSError, IOError) as err:
4903         if callable(to_screen) is not None:
4904             to_screen('unable to create directory ' + error_to_compat_str(err))
4905         return False
4906
4907
4908 def get_executable_path():
4909     from zipimport import zipimporter
4910     if hasattr(sys, 'frozen'):  # Running from PyInstaller
4911         path = os.path.dirname(sys.executable)
4912     elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
4913         path = os.path.join(os.path.dirname(__file__), '../..')
4914     else:
4915         path = os.path.join(os.path.dirname(__file__), '..')
4916     return os.path.abspath(path)
4917
4918
4919 def load_plugins(name, suffix, namespace):
4920     classes = {}
4921     try:
4922         plugins_spec = importlib.util.spec_from_file_location(
4923             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4924         plugins = importlib.util.module_from_spec(plugins_spec)
4925         sys.modules[plugins_spec.name] = plugins
4926         plugins_spec.loader.exec_module(plugins)
4927         for name in dir(plugins):
4928             if name in namespace:
4929                 continue
4930             if not name.endswith(suffix):
4931                 continue
4932             klass = getattr(plugins, name)
4933             classes[name] = namespace[name] = klass
4934     except FileNotFoundError:
4935         pass
4936     return classes
4937
4938
4939 def traverse_obj(
4940         obj, *path_list, default=None, expected_type=None, get_all=True,
4941         casesense=True, is_user_input=False, traverse_string=False):
4942     ''' Traverse nested list/dict/tuple
4943     @param path_list        A list of paths which are checked one by one.
4944                             Each path is a list of keys where each key is a string,
4945                             a function, a tuple of strings/None or "...".
4946                             When a fuction is given, it takes the key as argument and
4947                             returns whether the key matches or not. When a tuple is given,
4948                             all the keys given in the tuple are traversed, and
4949                             "..." traverses all the keys in the object
4950                             "None" returns the object without traversal
4951     @param default          Default value to return
4952     @param expected_type    Only accept final value of this type (Can also be any callable)
4953     @param get_all          Return all the values obtained from a path or only the first one
4954     @param casesense        Whether to consider dictionary keys as case sensitive
4955     @param is_user_input    Whether the keys are generated from user input. If True,
4956                             strings are converted to int/slice if necessary
4957     @param traverse_string  Whether to traverse inside strings. If True, any
4958                             non-compatible object will also be converted into a string
4959     # TODO: Write tests
4960     '''
4961     if not casesense:
4962         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4963         path_list = (map(_lower, variadic(path)) for path in path_list)
4964
4965     def _traverse_obj(obj, path, _current_depth=0):
4966         nonlocal depth
4967         path = tuple(variadic(path))
4968         for i, key in enumerate(path):
4969             if None in (key, obj):
4970                 return obj
4971             if isinstance(key, (list, tuple)):
4972                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4973                 key = ...
4974             if key is ...:
4975                 obj = (obj.values() if isinstance(obj, dict)
4976                        else obj if isinstance(obj, (list, tuple, LazyList))
4977                        else str(obj) if traverse_string else [])
4978                 _current_depth += 1
4979                 depth = max(depth, _current_depth)
4980                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4981             elif callable(key):
4982                 if isinstance(obj, (list, tuple, LazyList)):
4983                     obj = enumerate(obj)
4984                 elif isinstance(obj, dict):
4985                     obj = obj.items()
4986                 else:
4987                     if not traverse_string:
4988                         return None
4989                     obj = str(obj)
4990                 _current_depth += 1
4991                 depth = max(depth, _current_depth)
4992                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
4993             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4994                 obj = (obj.get(key) if casesense or (key in obj)
4995                        else next((v for k, v in obj.items() if _lower(k) == key), None))
4996             else:
4997                 if is_user_input:
4998                     key = (int_or_none(key) if ':' not in key
4999                            else slice(*map(int_or_none, key.split(':'))))
5000                     if key == slice(None):
5001                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5002                 if not isinstance(key, (int, slice)):
5003                     return None
5004                 if not isinstance(obj, (list, tuple, LazyList)):
5005                     if not traverse_string:
5006                         return None
5007                     obj = str(obj)
5008                 try:
5009                     obj = obj[key]
5010                 except IndexError:
5011                     return None
5012         return obj
5013
5014     if isinstance(expected_type, type):
5015         type_test = lambda val: val if isinstance(val, expected_type) else None
5016     elif expected_type is not None:
5017         type_test = expected_type
5018     else:
5019         type_test = lambda val: val
5020
5021     for path in path_list:
5022         depth = 0
5023         val = _traverse_obj(obj, path)
5024         if val is not None:
5025             if depth:
5026                 for _ in range(depth - 1):
5027                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5028                 val = [v for v in map(type_test, val) if v is not None]
5029                 if val:
5030                     return val if get_all else val[0]
5031             else:
5032                 val = type_test(val)
5033                 if val is not None:
5034                     return val
5035     return default
5036
5037
5038 # Deprecated
5039 def traverse_dict(dictn, keys, casesense=True):
5040     write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5041                  'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5042     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5043
5044
5045 def variadic(x, allowed_types=(str, bytes, dict)):
5046     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5047
5048
5049 # create a JSON Web Signature (jws) with HS256 algorithm
5050 # the resulting format is in JWS Compact Serialization
5051 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5052 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5053 def jwt_encode_hs256(payload_data, key, headers={}):
5054     header_data = {
5055         'alg': 'HS256',
5056         'typ': 'JWT',
5057     }
5058     if headers:
5059         header_data.update(headers)
5060     header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5061     payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5062     h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5063     signature_b64 = base64.b64encode(h.digest())
5064     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5065     return token
5066
5067
5068 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5069 def jwt_decode_hs256(jwt):
5070     header_b64, payload_b64, signature_b64 = jwt.split('.')
5071     payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5072     return payload_data
5073
5074
5075 def supports_terminal_sequences(stream):
5076     if compat_os_name == 'nt':
5077         from .compat import WINDOWS_VT_MODE  # Must be imported locally
5078         if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5079             return False
5080     elif not os.getenv('TERM'):
5081         return False
5082     try:
5083         return stream.isatty()
5084     except BaseException:
5085         return False
5086
5087
5088 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5089
5090
5091 def remove_terminal_sequences(string):
5092     return _terminal_sequences_re.sub('', string)
5093
5094
5095 def number_of_digits(number):
5096     return len('%d' % number)
5097
5098
5099 def join_nonempty(*values, delim='-', from_dict=None):
5100     if from_dict is not None:
5101         values = map(from_dict.get, values)
5102     return delim.join(map(str, filter(None, values)))