yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import collections
  11 import contextlib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import email.header
  16 import errno
  17 import functools
  18 import gzip
  19 import hashlib
  20 import hmac
  21 import importlib.util
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import operator
  28 import os
  29 import platform
  30 import random
  31 import re
  32 import socket
  33 import ssl
  34 import subprocess
  35 import sys
  36 import tempfile
  37 import time
  38 import traceback
  39 import xml.etree.ElementTree
  40 import zlib
  41 import mimetypes
  42
  43 from .compat import (
  44     compat_HTMLParseError,
  45     compat_HTMLParser,
  46     compat_HTTPError,
  47     compat_basestring,
  48     compat_chr,
  49     compat_cookiejar,
  50     compat_ctypes_WINFUNCTYPE,
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_html_entities,
  54     compat_html_entities_html5,
  55     compat_http_client,
  56     compat_integer_types,
  57     compat_numeric_types,
  58     compat_kwargs,
  59     compat_os_name,
  60     compat_parse_qs,
  61     compat_shlex_quote,
  62     compat_str,
  63     compat_struct_pack,
  64     compat_struct_unpack,
  65     compat_urllib_error,
  66     compat_urllib_parse,
  67     compat_urllib_parse_urlencode,
  68     compat_urllib_parse_urlparse,
  69     compat_urllib_parse_urlunparse,
  70     compat_urllib_parse_quote,
  71     compat_urllib_parse_quote_plus,
  72     compat_urllib_parse_unquote_plus,
  73     compat_urllib_request,
  74     compat_urlparse,
  75     compat_xpath,
  76 )
  77
  78 from .socks import (
  79     ProxyType,
  80     sockssocket,
  81 )
  82
  83
  84 def register_socks_protocols():
  85     # "Register" SOCKS protocols
  86     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  87     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  88     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  89         if scheme not in compat_urlparse.uses_netloc:
  90             compat_urlparse.uses_netloc.append(scheme)
  91
  92
  93 # This is not clearly defined otherwise
  94 compiled_regex_type = type(re.compile(''))
  95
  96
  97 def random_user_agent():
  98     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  99     _CHROME_VERSIONS = (
 100         '90.0.4430.212',
 101         '90.0.4430.24',
 102         '90.0.4430.70',
 103         '90.0.4430.72',
 104         '90.0.4430.85',
 105         '90.0.4430.93',
 106         '91.0.4472.101',
 107         '91.0.4472.106',
 108         '91.0.4472.114',
 109         '91.0.4472.124',
 110         '91.0.4472.164',
 111         '91.0.4472.19',
 112         '91.0.4472.77',
 113         '92.0.4515.107',
 114         '92.0.4515.115',
 115         '92.0.4515.131',
 116         '92.0.4515.159',
 117         '92.0.4515.43',
 118         '93.0.4556.0',
 119         '93.0.4577.15',
 120         '93.0.4577.63',
 121         '93.0.4577.82',
 122         '94.0.4606.41',
 123         '94.0.4606.54',
 124         '94.0.4606.61',
 125         '94.0.4606.71',
 126         '94.0.4606.81',
 127         '94.0.4606.85',
 128         '95.0.4638.17',
 129         '95.0.4638.50',
 130         '95.0.4638.54',
 131         '95.0.4638.69',
 132         '95.0.4638.74',
 133         '96.0.4664.18',
 134         '96.0.4664.45',
 135         '96.0.4664.55',
 136         '96.0.4664.93',
 137         '97.0.4692.20',
 138     )
 139     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 140
 141
# Default HTTP request headers. The User-Agent is drawn once, when this module
# is imported, so it stays the same for the lifetime of the process.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel meaning "no default was supplied"; compared by identity so
# that None remains usable as an explicit default value
NO_DEFAULT = object()
 156
# Full English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 167
# File extensions (without the leading dot) of known media containers and
# streaming manifests, grouped by family.
# NOTE(review): the consumers of this tuple are outside this section.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 182
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# paired by position; the bracketed lists hold the replacements that are two
# characters long ('AE', 'OE', 'TH', 'ss', ...), so that zip() keeps them whole.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 187
# Date/time layouts written with strptime-style directives. None of them
# depends on whether the day or the month is written first in an all-numeric
# date; those ambiguous layouts live in the two lists derived from this tuple.
# NOTE(review): presumably tried in order by the date parser - confirm at the call site.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
 232
 233 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 234 DATE_FORMATS_DAY_FIRST.extend([
 235     '%d-%m-%Y',
 236     '%d.%m.%Y',
 237     '%d.%m.%y',
 238     '%d/%m/%Y',
 239     '%d/%m/%y',
 240     '%d/%m/%Y %H:%M:%S',
 241 ])
 242
 243 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 244 DATE_FORMATS_MONTH_FIRST.extend([
 245     '%m-%d-%Y',
 246     '%m.%d.%Y',
 247     '%m/%d/%Y',
 248     '%m/%d/%y',
 249     '%m/%d/%Y %H:%M:%S',
 250 ])
 251
# Matches the argument list at the end of a packed script: a quoted payload,
# two integers and a '|'-separated word list passed through .split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive,
# spanning newlines) and captures its body in the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 254
 255
 256 def preferredencoding():
 257     """Get preferred encoding.
 258
 259     Returns the best encoding scheme for the system, based on
 260     locale.getpreferredencoding() and some further tweaks.
 261     """
 262     try:
 263         pref = locale.getpreferredencoding()
 264         'TEST'.encode(pref)
 265     except Exception:
 266         pref = 'UTF-8'
 267
 268     return pref
 269
 270
 271 def write_json_file(obj, fn):
 272     """ Encode obj as JSON and write it to fn, atomically if possible """
 273
 274     fn = encodeFilename(fn)
 275     if sys.version_info < (3, 0) and sys.platform != 'win32':
 276         encoding = get_filesystem_encoding()
 277         # os.path.basename returns a bytes object, but NamedTemporaryFile
 278         # will fail if the filename contains non ascii characters unless we
 279         # use a unicode object
 280         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 281         # the same for os.path.dirname
 282         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 283     else:
 284         path_basename = os.path.basename
 285         path_dirname = os.path.dirname
 286
 287     args = {
 288         'suffix': '.tmp',
 289         'prefix': path_basename(fn) + '.',
 290         'dir': path_dirname(fn),
 291         'delete': False,
 292     }
 293
 294     # In Python 2.x, json.dump expects a bytestream.
 295     # In Python 3.x, it writes to a character stream
 296     if sys.version_info < (3, 0):
 297         args['mode'] = 'wb'
 298     else:
 299         args.update({
 300             'mode': 'w',
 301             'encoding': 'utf-8',
 302         })
 303
 304     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 305
 306     try:
 307         with tf:
 308             json.dump(obj, tf)
 309         if sys.platform == 'win32':
 310             # Need to remove existing file on Windows, else os.rename raises
 311             # WindowsError or FileExistsError.
 312             try:
 313                 os.unlink(fn)
 314             except OSError:
 315                 pass
 316         try:
 317             mask = os.umask(0)
 318             os.umask(mask)
 319             os.chmod(tf.name, 0o666 & ~mask)
 320         except OSError:
 321             pass
 322         os.rename(tf.name, fn)
 323     except Exception:
 324         try:
 325             os.remove(tf.name)
 326         except OSError:
 327             pass
 328         raise
 329
 330
 331 if sys.version_info >= (2, 7):
 332     def find_xpath_attr(node, xpath, key, val=None):
 333         """ Find the xpath xpath[@key=val] """
 334         assert re.match(r'^[a-zA-Z_-]+$', key)
 335         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 336         return node.find(expr)
 337 else:
 338     def find_xpath_attr(node, xpath, key, val=None):
 339         for f in node.findall(compat_xpath(xpath)):
 340             if key not in f.attrib:
 341                 continue
 342             if val is None or f.attrib.get(key) == val:
 343                 return f
 344         return None
 345
 346 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 347 # the namespace parameter
 348
 349
 350 def xpath_with_ns(path, ns_map):
 351     components = [c.split(':') for c in path.split('/')]
 352     replaced = []
 353     for c in components:
 354         if len(c) == 1:
 355             replaced.append(c[0])
 356         else:
 357             ns, tag = c
 358             replaced.append('{%s}%s' % (ns_map[ns], tag))
 359     return '/'.join(replaced)
 360
 361
 362 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 363     def _find_xpath(xpath):
 364         return node.find(compat_xpath(xpath))
 365
 366     if isinstance(xpath, (str, compat_str)):
 367         n = _find_xpath(xpath)
 368     else:
 369         for xp in xpath:
 370             n = _find_xpath(xp)
 371             if n is not None:
 372                 break
 373
 374     if n is None:
 375         if default is not NO_DEFAULT:
 376             return default
 377         elif fatal:
 378             name = xpath if name is None else name
 379             raise ExtractorError('Could not find XML element %s' % name)
 380         else:
 381             return None
 382     return n
 383
 384
 385 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 386     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 387     if n is None or n == default:
 388         return n
 389     if n.text is None:
 390         if default is not NO_DEFAULT:
 391             return default
 392         elif fatal:
 393             name = xpath if name is None else name
 394             raise ExtractorError('Could not find XML element\'s text %s' % name)
 395         else:
 396             return None
 397     return n.text
 398
 399
 400 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 401     n = find_xpath_attr(node, xpath, key)
 402     if n is None:
 403         if default is not NO_DEFAULT:
 404             return default
 405         elif fatal:
 406             name = '%s[@%s]' % (xpath, key) if name is None else name
 407             raise ExtractorError('Could not find XML attribute %s' % name)
 408         else:
 409             return None
 410     return n.attrib[key]
 411
 412
 413 def get_element_by_id(id, html):
 414     """Return the content of the tag with the specified ID in the passed HTML document"""
 415     return get_element_by_attribute('id', id, html)
 416
 417
 418 def get_element_by_class(class_name, html):
 419     """Return the content of the first tag with the specified class in the passed HTML document"""
 420     retval = get_elements_by_class(class_name, html)
 421     return retval[0] if retval else None
 422
 423
 424 def get_element_by_attribute(attribute, value, html, escape_value=True):
 425     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 426     return retval[0] if retval else None
 427
 428
 429 def get_elements_by_class(class_name, html):
 430     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 431     return get_elements_by_attribute(
 432         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 433         html, escape_value=False)
 434
 435
 436 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 437     """Return the content of the tag with the specified attribute in the passed HTML document"""
 438
 439     value = re.escape(value) if escape_value else value
 440
 441     retlist = []
 442     for m in re.finditer(r'''(?xs)
 443         <([a-zA-Z0-9:._-]+)
 444          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 445          \s+%s=['"]?%s['"]?
 446          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 447         \s*>
 448         (?P<content>.*?)
 449         </\1>
 450     ''' % (re.escape(attribute), value), html):
 451         res = m.group('content')
 452
 453         if res.startswith('"') or res.startswith("'"):
 454             res = res[1:-1]
 455
 456         retlist.append(unescapeHTML(res))
 457
 458     return retlist
 459
 460
 461 class HTMLAttributeParser(compat_HTMLParser):
 462     """Trivial HTML parser to gather the attributes for a single element"""
 463
 464     def __init__(self):
 465         self.attrs = {}
 466         compat_HTMLParser.__init__(self)
 467
 468     def handle_starttag(self, tag, attrs):
 469         self.attrs = dict(attrs)
 470
 471
 472 class HTMLListAttrsParser(compat_HTMLParser):
 473     """HTML parser to gather the attributes for the elements of a list"""
 474
 475     def __init__(self):
 476         compat_HTMLParser.__init__(self)
 477         self.items = []
 478         self._level = 0
 479
 480     def handle_starttag(self, tag, attrs):
 481         if tag == 'li' and self._level == 0:
 482             self.items.append(dict(attrs))
 483         self._level += 1
 484
 485     def handle_endtag(self, tag):
 486         self._level -= 1
 487
 488
 489 def extract_attributes(html_element):
 490     """Given a string for an HTML element such as
 491     <el
 492          a="foo" B="bar" c="&98;az" d=boz
 493          empty= noval entity="&amp;"
 494          sq='"' dq="'"
 495     >
 496     Decode and return a dictionary of attributes.
 497     {
 498         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 499         'empty': '', 'noval': None, 'entity': '&',
 500         'sq': '"', 'dq': '\''
 501     }.
 502     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 503     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 504     """
 505     parser = HTMLAttributeParser()
 506     try:
 507         parser.feed(html_element)
 508         parser.close()
 509     # Older Python may throw HTMLParseError in case of malformed HTML
 510     except compat_HTMLParseError:
 511         pass
 512     return parser.attrs
 513
 514
 515 def parse_list(webpage):
 516     """Given a string for an series of HTML <li> elements,
 517     return a dictionary of their attributes"""
 518     parser = HTMLListAttrsParser()
 519     parser.feed(webpage)
 520     parser.close()
 521     return parser.items
 522
 523
 524 def clean_html(html):
 525     """Clean an HTML snippet into a readable string"""
 526
 527     if html is None:  # Convenience for sanitizing descriptions etc.
 528         return html
 529
 530     # Newline vs <br />
 531     html = html.replace('\n', ' ')
 532     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 533     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 534     # Strip html tags
 535     html = re.sub('<.*?>', '', html)
 536     # Replace html entities
 537     html = unescapeHTML(html)
 538     return html.strip()
 539
 540
 541 def sanitize_open(filename, open_mode):
 542     """Try to open the given filename, and slightly tweak it if this fails.
 543
 544     Attempts to open the given filename. If this fails, it tries to change
 545     the filename slightly, step by step, until it's either able to open it
 546     or it fails and raises a final exception, like the standard open()
 547     function.
 548
 549     It returns the tuple (stream, definitive_file_name).
 550     """
 551     try:
 552         if filename == '-':
 553             if sys.platform == 'win32':
 554                 import msvcrt
 555                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 556             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 557         stream = open(encodeFilename(filename), open_mode)
 558         return (stream, filename)
 559     except (IOError, OSError) as err:
 560         if err.errno in (errno.EACCES,):
 561             raise
 562
 563         # In case of error, try to remove win32 forbidden chars
 564         alt_filename = sanitize_path(filename)
 565         if alt_filename == filename:
 566             raise
 567         else:
 568             # An exception here should be caught in the caller
 569             stream = open(encodeFilename(alt_filename), open_mode)
 570             return (stream, alt_filename)
 571
 572
 573 def timeconvert(timestr):
 574     """Convert RFC 2822 defined time string into system timestamp"""
 575     timestamp = None
 576     timetuple = email.utils.parsedate_tz(timestr)
 577     if timetuple is not None:
 578         timestamp = email.utils.mktime_tz(timetuple)
 579     return timestamp
 580
 581
 582 def sanitize_filename(s, restricted=False, is_id=False):
 583     """Sanitizes a string so it could be used as part of a filename.
 584     If restricted is set, use a stricter subset of allowed characters.
 585     Set is_id if this is not an arbitrary string, but an ID that should be kept
 586     if possible.
 587     """
 588     def replace_insane(char):
 589         if restricted and char in ACCENT_CHARS:
 590             return ACCENT_CHARS[char]
 591         elif not restricted and char == '\n':
 592             return ' '
 593         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 594             return ''
 595         elif char == '"':
 596             return '' if restricted else '\''
 597         elif char == ':':
 598             return '_-' if restricted else ' -'
 599         elif char in '\\/|*<>':
 600             return '_'
 601         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 602             return '_'
 603         if restricted and ord(char) > 127:
 604             return '_'
 605         return char
 606
 607     if s == '':
 608         return ''
 609     # Handle timestamps
 610     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 611     result = ''.join(map(replace_insane, s))
 612     if not is_id:
 613         while '__' in result:
 614             result = result.replace('__', '_')
 615         result = result.strip('_')
 616         # Common case of "Foreign band name - English song title"
 617         if restricted and result.startswith('-_'):
 618             result = result[2:]
 619         if result.startswith('-'):
 620             result = '_' + result[len('-'):]
 621         result = result.lstrip('.')
 622         if not result:
 623             result = '_'
 624     return result
 625
 626
 627 def sanitize_path(s, force=False):
 628     """Sanitizes and normalizes path on Windows"""
 629     if sys.platform == 'win32':
 630         force = False
 631         drive_or_unc, _ = os.path.splitdrive(s)
 632         if sys.version_info < (2, 7) and not drive_or_unc:
 633             drive_or_unc, _ = os.path.splitunc(s)
 634     elif force:
 635         drive_or_unc = ''
 636     else:
 637         return s
 638
 639     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 640     if drive_or_unc:
 641         norm_path.pop(0)
 642     sanitized_path = [
 643         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 644         for path_part in norm_path]
 645     if drive_or_unc:
 646         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 647     elif force and s[0] == os.path.sep:
 648         sanitized_path.insert(0, os.path.sep)
 649     return os.path.join(*sanitized_path)
 650
 651
 652 def sanitize_url(url):
 653     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 654     # the number of unwanted failures due to missing protocol
 655     if url.startswith('//'):
 656         return 'http:%s' % url
 657     # Fix some common typos seen so far
 658     COMMON_TYPOS = (
 659         # https://github.com/ytdl-org/youtube-dl/issues/15649
 660         (r'^httpss://', r'https://'),
 661         # https://bx1.be/lives/direct-tv/
 662         (r'^rmtp([es]?)://', r'rtmp\1://'),
 663     )
 664     for mistake, fixup in COMMON_TYPOS:
 665         if re.match(mistake, url):
 666             return re.sub(mistake, fixup, url)
 667     return url
 668
 669
 670 def extract_basic_auth(url):
 671     parts = compat_urlparse.urlsplit(url)
 672     if parts.username is None:
 673         return url, None
 674     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 675         parts.hostname if parts.port is None
 676         else '%s:%d' % (parts.hostname, parts.port))))
 677     auth_payload = base64.b64encode(
 678         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 679     return url, 'Basic ' + auth_payload.decode('utf-8')
 680
 681
 682 def sanitized_Request(url, *args, **kwargs):
 683     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 684     if auth_header is not None:
 685         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 686         headers['Authorization'] = auth_header
 687     return compat_urllib_request.Request(url, *args, **kwargs)
 688
 689
 690 def expand_path(s):
 691     """Expand shell variables and ~"""
 692     return os.path.expandvars(compat_expanduser(s))
 693
 694
 695 def orderedSet(iterable):
 696     """ Remove all duplicates from the input iterable """
 697     res = []
 698     for el in iterable:
 699         if el not in res:
 700             res.append(el)
 701     return res
 702
 703
 704 def _htmlentity_transform(entity_with_semicolon):
 705     """Transforms an HTML entity to a character."""
 706     entity = entity_with_semicolon[:-1]
 707
 708     # Known non-numeric HTML entity
 709     if entity in compat_html_entities.name2codepoint:
 710         return compat_chr(compat_html_entities.name2codepoint[entity])
 711
 712     # TODO: HTML5 allows entities without a semicolon. For example,
 713     # '&Eacuteric' should be decoded as 'Éric'.
 714     if entity_with_semicolon in compat_html_entities_html5:
 715         return compat_html_entities_html5[entity_with_semicolon]
 716
 717     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 718     if mobj is not None:
 719         numstr = mobj.group(1)
 720         if numstr.startswith('x'):
 721             base = 16
 722             numstr = '0%s' % numstr
 723         else:
 724             base = 10
 725         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 726         try:
 727             return compat_chr(int(numstr, base))
 728         except ValueError:
 729             pass
 730
 731     # Unknown entity in name, return its literal representation
 732     return '&%s;' % entity
 733
 734
 735 def unescapeHTML(s):
 736     if s is None:
 737         return None
 738     assert type(s) == compat_str
 739
 740     return re.sub(
 741         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 742
 743
 744 def escapeHTML(text):
 745     return (
 746         text
 747         .replace('&', '&amp;')
 748         .replace('<', '&lt;')
 749         .replace('>', '&gt;')
 750         .replace('"', '&quot;')
 751         .replace("'", '&#39;')
 752     )
 753
 754
 755 def process_communicate_or_kill(p, *args, **kwargs):
 756     try:
 757         return p.communicate(*args, **kwargs)
 758     except BaseException:  # Including KeyboardInterrupt
 759         p.kill()
 760         p.wait()
 761         raise
 762
 763
 764 class Popen(subprocess.Popen):
 765     if sys.platform == 'win32':
 766         _startupinfo = subprocess.STARTUPINFO()
 767         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 768     else:
 769         _startupinfo = None
 770
 771     def __init__(self, *args, **kwargs):
 772         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 773
 774     def communicate_or_kill(self, *args, **kwargs):
 775         return process_communicate_or_kill(self, *args, **kwargs)
 776
 777
 778 def get_subprocess_encoding():
 779     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 780         # For subprocess calls, encode with locale encoding
 781         # Refer to http://stackoverflow.com/a/9951851/35070
 782         encoding = preferredencoding()
 783     else:
 784         encoding = sys.getfilesystemencoding()
 785     if encoding is None:
 786         encoding = 'utf-8'
 787     return encoding
 788
 789
 790 def encodeFilename(s, for_subprocess=False):
 791     """
 792     @param s The name of the file
 793     """
 794
 795     assert type(s) == compat_str
 796
 797     # Python 3 has a Unicode API
 798     if sys.version_info >= (3, 0):
 799         return s
 800
 801     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 802     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 803     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 804     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 805         return s
 806
 807     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 808     if sys.platform.startswith('java'):
 809         return s
 810
 811     return s.encode(get_subprocess_encoding(), 'ignore')
 812
 813
 814 def decodeFilename(b, for_subprocess=False):
 815
 816     if sys.version_info >= (3, 0):
 817         return b
 818
 819     if not isinstance(b, bytes):
 820         return b
 821
 822     return b.decode(get_subprocess_encoding(), 'ignore')
 823
 824
 825 def encodeArgument(s):
 826     if not isinstance(s, compat_str):
 827         # Legacy code that uses byte strings
 828         # Uncomment the following line after fixing all post processors
 829         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 830         s = s.decode('ascii')
 831     return encodeFilename(s, True)
 832
 833
 834 def decodeArgument(b):
 835     return decodeFilename(b, True)
 836
 837
 838 def decodeOption(optval):
 839     if optval is None:
 840         return optval
 841     if isinstance(optval, bytes):
 842         optval = optval.decode(preferredencoding())
 843
 844     assert isinstance(optval, compat_str)
 845     return optval
 846
 847
 848 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 849
 850
 851 def timetuple_from_msec(msec):
 852     secs, msec = divmod(msec, 1000)
 853     mins, secs = divmod(secs, 60)
 854     hrs, mins = divmod(mins, 60)
 855     return _timetuple(hrs, mins, secs, msec)
 856
 857
 858 def formatSeconds(secs, delim=':', msec=False):
 859     time = timetuple_from_msec(secs * 1000)
 860     if time.hours:
 861         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 862     elif time.minutes:
 863         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 864     else:
 865         ret = '%d' % time.seconds
 866     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 867
 868
 869 def _ssl_load_windows_store_certs(ssl_context, storename):
 870     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 871     try:
 872         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 873                  if encoding == 'x509_asn' and (
 874                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 875     except PermissionError:
 876         return
 877     for cert in certs:
 878         try:
 879             ssl_context.load_verify_locations(cadata=cert)
 880         except ssl.SSLError:
 881             pass
 882
 883
def make_HTTPS_handler(params, **kwargs):
    """Create the HTTPS handler together with its SSL context.

    @param params  The options dictionary; certificate and host name checks
                   are turned off if 'nocheckcertificate' is set in it
    @param kwargs  Passed on to YoutubeDLHTTPSHandler
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname is assigned before verify_mode: the ssl module refuses
    # CERT_NONE for as long as check_hostname is enabled
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                # Load the certificates one by one, so that a bad one is skipped
                # instead of failing the whole store
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 905
 906
 907 def bug_reports_message(before=';'):
 908     if ytdl_is_updateable():
 909         update_cmd = 'type  yt-dlp -U  to update'
 910     else:
 911         update_cmd = 'see  https://github.com/yt-dlp/yt-dlp  on how to update'
 912     msg = 'please report this issue on  https://github.com/yt-dlp/yt-dlp .'
 913     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 914     msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
 915
 916     before = before.rstrip()
 917     if not before or before.endswith(('.', '!', '?')):
 918         msg = msg[0].title() + msg[1:]
 919
 920     return (before + ' ' if before else '') + msg
 921
 922
 923 class YoutubeDLError(Exception):
 924     """Base exception for YoutubeDL errors."""
 925     msg = None
 926
 927     def __init__(self, msg=None):
 928         if msg is not None:
 929             self.msg = msg
 930         elif self.msg is None:
 931             self.msg = type(self).__name__
 932         super().__init__(self.msg)
 933
 934
 935 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 936 if hasattr(ssl, 'CertificateError'):
 937     network_exceptions.append(ssl.CertificateError)
 938 network_exceptions = tuple(network_exceptions)
 939
 940
 941 class ExtractorError(YoutubeDLError):
 942     """Error during info extraction."""
 943
 944     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
 945         """ tb, if given, is the original traceback (so that it can be printed out).
 946         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
 947         """
 948         if sys.exc_info()[0] in network_exceptions:
 949             expected = True
 950
 951         self.msg = str(msg)
 952         self.traceback = tb
 953         self.expected = expected
 954         self.cause = cause
 955         self.video_id = video_id
 956         self.ie = ie
 957         self.exc_info = sys.exc_info()  # preserve original exception
 958
 959         super(ExtractorError, self).__init__(''.join((
 960             format_field(ie, template='[%s] '),
 961             format_field(video_id, template='%s: '),
 962             self.msg,
 963             format_field(cause, template=' (caused by %r)'),
 964             '' if expected else bug_reports_message())))
 965
 966     def format_traceback(self):
 967         if self.traceback is None:
 968             return None
 969         return ''.join(traceback.format_tb(self.traceback))
 970
 971
 972 class UnsupportedError(ExtractorError):
 973     def __init__(self, url):
 974         super(UnsupportedError, self).__init__(
 975             'Unsupported URL: %s' % url, expected=True)
 976         self.url = url
 977
 978
 979 class RegexNotFoundError(ExtractorError):
 980     """Error when a regex didn't match"""
 981     pass
 982
 983
 984 class GeoRestrictedError(ExtractorError):
 985     """Geographic restriction Error exception.
 986
 987     This exception may be thrown when a video is not available from your
 988     geographic location due to geographic restrictions imposed by a website.
 989     """
 990
 991     def __init__(self, msg, countries=None, **kwargs):
 992         kwargs['expected'] = True
 993         super(GeoRestrictedError, self).__init__(msg, **kwargs)
 994         self.countries = countries
 995
 996
 997 class DownloadError(YoutubeDLError):
 998     """Download Error exception.
 999
1000     This exception may be thrown by FileDownloader objects if they are not
1001     configured to continue on errors. They will contain the appropriate
1002     error message.
1003     """
1004
1005     def __init__(self, msg, exc_info=None):
1006         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1007         super(DownloadError, self).__init__(msg)
1008         self.exc_info = exc_info
1009
1010
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
1018
1019
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that several files
    would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Extends the class default on this instance only
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1032
1033
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this to signal that the
    postprocessing task failed.
    """
1040
1041
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
1045
1046
class ExistingVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-on-existing."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1050
1051
class RejectedVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-on-reject."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1055
1056
class MaxDownloadsReached(DownloadCancelled):
    """Download cancellation triggered when the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1060
1061
class ReExtractInfo(YoutubeDLError):
    """Video info needs to be re-extracted.

    ``expected`` is stored on the instance exactly as passed.
    """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
1068
1069
class ThrottledDownload(ReExtractInfo):
    """Download speed below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the class message and is never marked as expected
        super().__init__(self.msg, expected=False)
1076
1077
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # Extends the class default on this instance only
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1090
1091
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this when a downloaded file is smaller
    than what the server announced first, which suggests the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super().__init__(message)
        # Both sizes are in bytes
        self.downloaded, self.expected = downloaded, expected
1107
1108
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno-style ``code`` and ``msg``, classified into ``reason``.

    ``reason`` is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED', derived
    from the code and from well-known fragments of the message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify using the errno value first, then the message text
        if code in (errno.ENOSPC, errno.EDQUOT) or any(
                fragment in msg for fragment in ('No space left', 'Disk quota exceeded')):
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1123
1124
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass with no behavior of its own.

    NOTE(review): presumably raised when extended-attribute support is not
    available - confirm against the callers.
    """
1127
1128
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and, if configured, bind it to a source address.

    ydl_handler must expose a ``_params`` mapping; its 'source_address'
    entry (an IP address string) selects the local address to bind to.
    args and kwargs are passed to http_class unchanged. is_https is kept
    for interface compatibility and is not used.

    Returns the connection object.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        err = None
        addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        # The address family is guessed from the textual form of the source IP
        af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        ip_addrs = [addr for addr in addrs if addr[0] == af]
        if addrs and not ip_addrs:
            ip_version = 'v4' if af == socket.AF_INET else 'v6'
            raise socket.error(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))
        for res in ip_addrs:
            af, socktype, proto, canonname, sa = res
            sock = None
            try:
                sock = socket.socket(af, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(sa)
                err = None  # Explicitly break reference cycle
                return sock
            except socket.error as _:
                err = _
                if sock is not None:
                    sock.close()
        if err is not None:
            try:
                raise err
            finally:
                err = None  # Explicitly break reference cycle
        raise socket.error('getaddrinfo returns an empty list')

    if hasattr(hc, '_create_connection'):
        hc._create_connection = _create_connection
    # Port 0 lets the OS pick the local port
    hc.source_address = (source_address, 0)
    return hc
1192
1193
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker to a header mapping.

    Without the marker the mapping is returned as is (same object). With
    it, a new dict is returned that lacks both the marker and any
    Accept-Encoding header (matched case-insensitively); the input mapping
    is left untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    return {
        name: value for name, value in headers.items()
        if name != 'Youtubedl-no-compression' and name.lower() != 'accept-encoding'}
1202
1203
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (read by _create_http_connection) and initialise the base handler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy when the 'Ytdl-socks-proxy' header is set."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal marker: must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to a zlib-wrapped one."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Escape the request URL, add the standard headers and return the request."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        """Return resp with gzip/deflate bodies decoded and redirect targets percent-encoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3;
                # re-interpret the header bytes as UTF-8
                location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1327
1328
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    base_class must be an HTTPConnection or HTTPSConnection (sub)class.
    socks_proxy is the proxy URL; its scheme selects the protocol
    (socks5, socks/socks4 or socks4a), the port defaults to 1080 and any
    user name / password are URL-unquoted before use.

    Raises ValueError if the scheme is not one of the supported ones.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    socks_types = {
        'socks5': ProxyType.SOCKS5,
        'socks': ProxyType.SOCKS4,
        'socks4': ProxyType.SOCKS4,
        'socks4a': ProxyType.SOCKS4A,
    }
    scheme = url_components.scheme.lower()
    # Membership test (not truthiness): a proxy type constant may be falsy
    if scheme not in socks_types:
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)
    socks_type = socks_types[scheme]

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                # Always wrap with the connection's own SSL context so its
                # verification settings apply to proxied connections too
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)

    return SocksConnection
1370
1371
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that can bind to a source address and use SOCKS proxies.

    params is stored as ``_params`` (read by _create_http_connection);
    https_conn_class overrides the connection class, which otherwise is
    HTTPSConnection.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, going through a SOCKS proxy when the 'Ytdl-socks-proxy' header is set."""
        conn_class = self._https_conn_class

        # Forward whichever of these settings the base handler has stored
        open_kwargs = {
            name: getattr(self, attr)
            for name, attr in (('context', '_context'), ('check_hostname', '_check_hostname'))
            if hasattr(self, attr)}

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal marker: must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1395
1396
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    MozillaCookieJar that reads and writes UTF-8 cookie files, tolerates
    '#HttpOnly_' prefixed entries and stores session cookies with an
    expiry of 0.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a cookie line
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set. Cookies marked to be discarded are skipped unless
        ignore_discard is set, expired ones unless ignore_expires is set.

        Session cookies (``expires is None``) are written with an expiry of
        0 instead of an empty string. The cookie objects held by the jar
        are not modified.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to write the file as UTF-8.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                # Session cookies are stored as 0; only the written value
                # changes, so the in-memory cookie keeps expires=None and
                # does not start looking expired after a save
                expires = str(0 if cookie.expires is None else cookie.expires)
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set. Malformed entries are skipped with a warning on stderr.
        Cookies whose expiry is 0 are turned into session cookies.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Validate one line of the file; raises LoadError for bad entries
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Only the lines that passed validation are handed to the stock parser
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1512                 cookie.discard = True
1513
1514
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose HTTPS hooks are the same as its HTTP ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # Plain delegation to the stock implementation. A Python 2-only
        # workaround used to sit here (already disabled): it percent-encoded
        # non-ASCII characters in Set-Cookie headers to keep the next
        # request from crashing, see
        # https://github.com/ytdl-org/youtube-dl/issues/6769.
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1537
1538
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler handles HTTP response status code
    308 Permanent Redirect [2], used by some sites [3], in the same way as
    the other redirect codes.

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Every redirect status is handled by the stock 302 implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.

        GET and HEAD requests are redirected for 301, 302, 303, 307 and
        308; POST requests only for 301, 302 and 303. The new request
        carries no body and drops the Content-Length / Content-Type headers.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {
            k: v for k, v in req.headers.items()
            if k.lower() not in CONTENT_HEADERS}
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
1594
1595
def extract_timezone(date_str):
    """Split a trailing UTC offset off date_str.

    Returns a (timedelta, remaining_string) tuple. When no timezone is
    recognised the offset is zero and the string is returned unchanged; a
    bare 'Z' also yields a zero offset.

    NOTE(review): the bare-'Z' alternative of the pattern is not anchored
    by '$', while the match is always cut off the end of the string -
    confirm this is intended.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    # Drop as many trailing characters as the timezone match is long
    remainder = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:
        return datetime.timedelta(), remainder

    factor = 1 if sign == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(m.group('hours')),
        minutes=factor * int(m.group('minutes')))
    return offset, remainder
1620
1621
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp from the given date.

    date_str must look like YYYY-MM-DD<delimiter>HH:MM:SS once fractional
    seconds (and, if timezone is None, a trailing timezone) are removed.
    timezone, if given, is the timedelta offset to subtract; otherwise it
    is extracted from the string. Returns None for None input or when the
    string cannot be parsed.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        local = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter))
        return calendar.timegm((local - timezone).timetuple())
    except ValueError:
        return None
1639
1640
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1643
1644
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None if unparsable"""
    if date_str is None:
        return None

    # Commas become spaces; AM/PM markers (plus an optional upper-case word
    # after them) and a trailing numeric UTC offset are dropped
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the e-mail date parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else compat_str(result)
1671
1672
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a loosely formatted date string.

    date_str:  the date to parse; None yields None
    day_first: selects which list of formats date_formats() returns

    Returns an int, or None when neither the known formats nor the e-mail
    date parser can read the string.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # NOTE(review): any "PM" adds a flat 12 hours, so "12:xx PM" lands on the
    # next day and "12:xx AM" is not shifted back -- kept as is here
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # extract_timezone() already cut the numeric offset out of date_str,
        # so parsedate_tz() cannot see it; apply it here like the loop above
        tz_seconds = timezone.days * 86400 + timezone.seconds
        return calendar.timegm(timetuple) + pm_delta * 3600 - tz_seconds
1704
1705
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext"""
    if url is None or '.' not in url:
        return default_ext

    # Text after the last dot of the part before the query string
    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
1717
1718
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" """
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
1721
1722
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string that is either a date in `format`,
    one of now/today/yesterday, or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse a plain date
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.now(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is None:
        # Not a relative expression: parse it as a plain date
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset is resolved recursively
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1763
1764
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return the date part of what datetime_from_str() yields for date_str,
    using microsecond precision (i.e. no rounding of the time portion).

    format: strptime format used to parse a plain date
    """
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1773
1774
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the target month.
    """
    # Work with a zero-based month so divmod splits years from months
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_shift
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1782
1783
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    (microsecond returns dt unchanged; otherwise day, hour, minute or second)
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = seconds_per_unit[precision]
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1800
1801
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1810
1811
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest/latest representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range (both bounds inclusive)"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1841
1842
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode with the preferred encoding if a byte string came back
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1851
1852
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1859
1860
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s:   the text to write
    out: the stream; only streams whose fileno() is 1 or 2 and whose
         handle is a console are written here (via WriteConsoleW)

    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # fileno -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid/missing, is not a character device
        # (ignoring the REMOTE flag), or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before any
    # non-BMP character; a non-BMP character is written on its own
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1933
1934
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    encoding: when given, used to encode s for byte-oriented output and
              disables the Windows console code path
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, try the console API first; it reports whether it wrote s
    console_candidate = sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno')
    if console_candidate and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying byte buffer: encode and write to it
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1955
1956
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integers ([] for empty input)"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are characters rather than ints
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
1964
1965
def intlist_to_bytes(xs):
    """Pack a list of integers into a byte string, one unsigned byte each"""
    if xs:
        fmt = '%dB' % len(xs)
        return compat_struct_pack(fmt, *xs)
    return b''
1970
1971
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and
# stubs that raise IOError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed as the last argument of LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock file object f from offset 0; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file() can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file(); raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            # No locking primitive available: always fail
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            # No locking primitive available: always fail
            raise IOError(UNSUPPORTED_MSG)
2045
2046
class locked_file(object):
    """Context manager around a text file that is locked while the block runs.

    The file is opened on construction; the lock is taken on entry
    (exclusive unless the mode is 'r') and released, with the file closed,
    on exit.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        try:
            # Readers share the lock, writers take it exclusively
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file if locking failed
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2076
2077
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2081
2082
def shell_quote(args):
    """Return args as one space-separated string with every item shell-quoted"""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([compat_shlex_quote(as_text(arg)) for arg in args])
2092
2093
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    url:  the URL to extend; data already smuggled into it is kept and
          takes precedence over equal keys in `data`
    data: dict of JSON-serializable values; it is not modified

    Returns the URL with the merged data appended as a fragment.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so the caller's dict is left untouched
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2102
2103
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) for a URL produced by smuggle_url().

    A URL without smuggled data is returned unchanged together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
2111
2112
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    num:    the number to format; values float_or_none() cannot convert and
            negative values yield None
    fmt:    %-format applied to (scaled number, suffix)
    factor: ratio between two consecutive suffixes
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        # math.log() is undefined for negative numbers
        return None
    suffixes = ['', *'KMGTPEZY']
    exponent = 0
    if num > 0:
        # Clamp to the suffix table: values below 1/factor would otherwise
        # give a negative index (picking 'Y'), and values past the largest
        # suffix would raise IndexError
        exponent = max(0, min(int(math.log(num, factor)), len(suffixes) - 1))
    converted = num / (factor ** exponent)
    return fmt % (converted, suffixes[exponent])
2122
2123
def format_bytes(bytes):
    """Format a byte count with two decimals and a 1024-based suffix, or 'N/A'"""
    formatted = format_decimal_suffix(bytes, '%.2f%siB', factor=1024)
    return formatted or 'N/A'
2126
2127
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s using the multipliers in unit_table.

    The number may use "," or "." as decimal separator. Returns the scaled
    value truncated to int, or None when s does not match.
    """
    units_re = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
2137
2138
def parse_filesize(s):
    """Parse a human-readable file size such as "1.5 MiB" into a number of bytes"""
    if s is None:
        return None

    # (symbol letter, decimal prefix, binary prefix), in increasing magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        _UNIT_TABLE.update({
            symbol + 'iB': binary,
            symbol + 'B': decimal,
            lower + 'B': binary,
            symbol + 'b': decimal,
            lower + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
2208
2209
def parse_count(s):
    """Parse a count such as "1,234" or "1.5k" into an int; None stays None"""
    if s is None:
        return None

    text = s.strip()

    # Plain digits with separators need no unit lookup
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand, million = 1000, 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(_UNIT_TABLE, text)
2229
2230
def parse_resolution(s):
    """Extract 'width'/'height' from text such as "1920x1080", "720p" or "4k".

    Returns a dict with the keys that could be determined ({} if none).
    """
    if s is None:
        return {}

    # <width> x <height>
    dims = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dims:
        width, height = (int(value) for value in dims.group('w', 'h'))
        return {
            'width': width,
            'height': height,
        }

    # <height>p / <height>i
    lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if lines:
        return {'height': int(lines.group(1))}

    # 4k / 8k
    kilo = re.search(r'\b([48])[kK]\b', s)
    return {'height': int(kilo.group(1)) * 540} if kilo else {}
2251
2252
def parse_bitrate(s):
    """Return the number in front of "kbps" in s as an int, or None"""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2259
2260
def month_by_name(name, lang='en'):
    """ Return the number of a month by its (locale-independent) name in the
        given language; unknown languages fall back to English """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2270
2271
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (the first three letters of the month name) """

    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2280
2281
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity or character
    reference with '&amp;' in XML"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2288
2289
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl().

    title: the new name (text). Nothing happens, silently, when running on
    Jython, when libc.so.6 cannot be loaded, or when the loaded libc has no
    prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so the string handed to prctl() is NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2314
2315
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is when it is
    None or does not begin with start"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2318
2319
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is when it is
    None or does not finish with end"""
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: with an empty `end`, s[:-0] would
    # wrongly produce an empty string
    return s[:len(s) - len(end)]
2322
2323
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'", ):
        return s[1:-1]
    return s
2331
2332
def get_domain(url):
    """Return the host part of url without scheme and leading "www.", or None"""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
2336
2337
def url_basename(url):
    """Return the last segment of the path component of url"""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
2341
2342
def base_url(url):
    """Return url up to and including the last "/" before any ?, # or &
    (fails with AttributeError when url is not an http(s) URL of that shape)"""
    found = re.match(r'https?://[^?#&]+/', url)
    return found.group()
2345
2346
def urljoin(base, path):
    """Join path onto base, returning None for unusable input.

    base, path: text or UTF-8 encoded bytes
    Returns path itself when it is already absolute or protocol-relative,
    None when path is empty/not text or base is not an http(s) or
    protocol-relative URL, and the joined URL otherwise.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    # "-" goes last in the class so it is a literal; written as "+-." it
    # formed a range that also accepted "," as a scheme character
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
2360
2361
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports HEAD"""

    def get_method(self):
        return 'HEAD'
2365
2366
class PUTRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports PUT"""

    def get_method(self):
        return 'PUT'
2370
2371
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiplied by invscale and floor-divided by scale.

    get_attr: when set, the conversion is applied to that attribute of v
    default:  returned for None, '' and values int() cannot convert
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2384
2385
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None"""
    if v is None:
        return default
    return compat_str(v)
2388
2389
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: integers pass through, and
        ',', '.' and '+' are dropped from strings before conversion """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        digits = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(digits)
    return None
2397
2398
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiplied by invscale and divided by scale;
    default is returned for None and for values float() cannot convert"""
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2406
2407
def bool_or_none(v, default=None):
    """Return v if it is a bool, otherwise default"""
    if isinstance(v, bool):
        return v
    return default
2410
2411
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is text, otherwise default"""
    if isinstance(v, compat_str):
        return v.strip()
    return default
2414
2415
def url_or_none(url):
    """Return the stripped url if it is text starting with a supported
    scheme (or "//"), otherwise None"""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2421
2422
def strftime_or_none(timestamp, date_format, default=None):
    """Format timestamp with date_format, or return default on failure.

    timestamp: a number (UNIX timestamp, interpreted as UTC) or a
               YYYYMMDD string
    """
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            moment = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            moment = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            # Unsupported type: strftime on None raises AttributeError below
            moment = None
        return moment.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2433
2434
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts clock notation ([[[DD:]HH:]MM:]SS[.ms]), unit notation such as
    "PT1H30M", "2 hours 5 min 3.5 s", and a single decimal amount of hours
    or minutes. Returns None for non-text, empty or unrecognised input.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    # Tried in order; the first pattern that matches supplies the fields
    patterns = (
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$',
        r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$',
    )
    for pattern in patterns:
        found = re.match(pattern, s)
        if found:
            break
    else:
        return None

    # The last pattern only defines hours and mins, hence .get()
    fields = found.groupdict()
    days, hours, mins = fields.get('days'), fields.get('hours'), fields.get('mins')
    secs, ms = fields.get('secs'), fields.get('ms')

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms keeps its leading dot, so this adds the fraction of a second
        duration += float(ms)
    return duration
2492
2493
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    When expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
2500
2501
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext.

    When expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
2507
2508
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when starting the binary raises OSError. """
    try:
        proc = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate_or_kill()
    except OSError:
        return False
    else:
        return exe
2517
2518
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout/stderr output,
    or False when starting it raises OSError"""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        proc = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate_or_kill()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        return out.decode('ascii', 'ignore')
    return out
2532
2533
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output, or unrecognized.

    version_re defaults to a pattern matching "version <something>".
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2543
2544
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or printed nothing) """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2551
2552
class LazyList(collections.abc.Sequence):
    """Lazy immutable list built on top of an iterable.

    Items are pulled from the iterable only when needed and kept in a cache.
    Note that slices of a LazyList are plain lists, not LazyLists.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self.__iterable = iter(iterable)
        self.__cache = _cache if _cache is not None else []
        self.__reversed = reverse

    def __iter__(self):
        if not self.__reversed:
            # Replay what was already fetched, then continue pulling new items
            yield from self.__cache
            for item in self.__iterable:
                self.__cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def __exhaust(self):
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        """Evaluate the entire iterable and return it as a list (in view order)"""
        everything = self.__exhaust()
        return everything[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        if x is None:
            return None
        return -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
        else:
            # Fetch just enough items to cover the highest requested index
            missing = max(start or 0, stop or 0) - len(self.__cache) + 1
            if missing > 0:
                self.__cache.extend(itertools.islice(self.__iterable, missing))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing the first underlying item is enough to know whether
        # the list is empty; no need to evaluate everything
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        else:
            return True

    def __len__(self):
        return len(self.__exhaust())

    def __reversed__(self):
        # The new view shares both the iterable and the cache with this one
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2641
2642
class PagedList:
    """Base class for lists whose entries are fetched one page at a time.

    ``pagefunc(pagenum)`` must return an iterable with the entries of that
    page. Subclasses implement ``_getslice`` to decide which pages to fetch.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page ``pagenum`` as a list, using the cache if enabled"""
        cached = self._cache.get(pagenum)
        page = list(self._pagefunc(pagenum)) if cached is None else cached
        if self._use_cache:
            self._cache[pagenum] = page
        return page

    def getslice(self, start=0, end=None):
        """Return the entries in the half-open range [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2680
2681
class OnDemandPagedList(PagedList):
    """PagedList that keeps fetching pages until a page comes back short"""

    def _getslice(self, start, end):
        pagesize = self._pagesize
        for pagenum in itertools.count(start // pagesize):
            page_first = pagenum * pagesize
            next_page_first = page_first + pagesize
            if next_page_first <= start:
                continue

            # Offset of the first wanted entry inside this page
            offset = start % pagesize if page_first <= start < next_page_first else 0
            # Offset just past the last wanted entry, if the slice ends in this page
            limit = None
            if end is not None and page_first <= end <= next_page_first:
                limit = ((end - 1) % pagesize) + 1

            entries = self.getpage(pagenum)
            if offset != 0 or limit is not None:
                entries = entries[offset:limit]
            yield from entries

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            # Likewise, if we got the whole page, but the next page is not
            # interesting, break out early as well
            if len(entries) + offset < pagesize or end == next_page_first:
                break
2715
2716
class InAdvancePagedList(PagedList):
    """PagedList for sources whose total number of pages is known in advance.

    ``pagecount`` is the number of available pages; pages are numbered from 0
    and caching is always enabled.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        """Yield the entries in the half-open range [start, end)"""
        start_page = start // self._pagesize
        # Never go past the last known page: without the clamp, a slice (or
        # index) reaching beyond the end would call pagefunc for pages that
        # do not exist
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                # Only the first fetched page can start mid-page
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page completes the requested range
                    yield from page_results[:only_more]
                    break
            yield from page_results
2740
2741
def uppercase_escape(s):
    r"""Replace every ``\UXXXXXXXX`` escape sequence in ``s`` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2748
2749
def lowercase_escape(s):
    r"""Replace every ``\uXXXX`` escape sequence in ``s`` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2756
2757
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    # Characters that are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2763
2764
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; the other components are percent-escaped
    replacements = {'netloc': parts.netloc.encode('idna').decode('ascii')}
    for component in ('path', 'params', 'query', 'fragment'):
        replacements[component] = escape_rfc3986(getattr(parts, component))
    return parts._replace(**replacements).geturl()
2775
2776
def parse_qs(url):
    """Parse the query string of ``url`` with ``compat_parse_qs`` and return the result"""
    query_string = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query_string)
2779
2780
def read_batch_urls(batch_fd):
    """Read URLs from the batch file object ``batch_fd`` and close it.

    Each line is cleaned up (BOM and surrounding whitespace removed, trailing
    `` #`` comments dropped). Empty lines and lines starting with ``#``,
    ``;`` or ``]`` are skipped.
    """
    def fixup(line):
        if not isinstance(line, compat_str):
            line = line.decode('utf-8', 'replace')
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if line.startswith(bom):
                line = line[len(bom):]
        line = line.lstrip()
        if not line or line.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        url, *_ = re.split(r'\s#', line, maxsplit=1)
        return url.rstrip()

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for line in fd:
            url = fixup(line)
            if url:
                urls.append(url)
    return urls
2798
2799
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return the result as ASCII bytes"""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2802
2803
def update_url_query(url, query):
    """Return ``url`` with the entries of the ``query`` mapping merged into its query string.

    Existing parameters with the same name are replaced. ``url`` is returned
    unchanged when ``query`` is empty.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2812
2813
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request based on ``req`` with the given parts overridden.

    ``headers`` are merged over the original headers and ``query`` is merged
    into the URL's query string. ``url`` and ``data`` replace the originals
    only when truthy. HEAD and PUT requests keep their method, and the
    ``timeout`` attribute is carried over when present.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    # Pick the request class that preserves the original HTTP method
    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = compat_urllib_request.Request

    new_req = request_class(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2832
2833
def _multipart_encode_impl(data, boundary):
    """Encode ``data`` as multipart/form-data using the given ``boundary``.

    Returns ``(body_bytes, content_type)``. Raises ValueError when the
    boundary occurs inside one of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for name, value in data.items():
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter)
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
2854
2855
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded body, Content-Type value). With an explicit
    boundary, ValueError is raised if the boundary overlaps with the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # The caller chose the boundary: a clash with the data is an error
        out, content_type = _multipart_encode_impl(data, boundary)
        return out, content_type

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            out, content_type = _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
        return out, content_type
2884
2885
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of several, in ``d``.

    When ``key_or_keys`` is a list or tuple, the value of the first key that
    is present and not None is returned; falsy values are skipped as well
    unless ``skip_false_values`` is False. A single key is looked up with a
    plain ``d.get``. ``default`` is returned when nothing is found.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2894
2895
def try_get(src, getter, expected_type=None):
    """Apply getter function(s) to ``src`` and return the first acceptable result.

    A getter that raises AttributeError, KeyError, TypeError or IndexError is
    skipped. A result is accepted when ``expected_type`` is None or the
    result is an instance of it. Returns None when no getter succeeds.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2905
2906
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to the earlier ones.

    None values are ignored. A value already merged is only replaced when it
    is an empty string and the new value is a non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key in merged:
                current = merged[key]
                fills_empty_string = (
                    isinstance(value, compat_str) and value
                    and isinstance(current, compat_str) and not current)
                if not fills_empty_string:
                    continue
            merged[key] = value
    return merged
2919
2920
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return ``string`` as text, decoding it with ``encoding`` if it is not text already"""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2923
2924
# Rating label -> age limit returned by parse_age_limit() for that label.
# parse_age_limit() upper-cases its input before looking it up here.
# NOTE(review): presumably these are the US film (MPAA) ratings - confirm
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# "TV-xx" label -> age limit returned by parse_age_limit() for that label.
# parse_age_limit() accepts the part after "TV-" with "-", "_" or no
# separator at all after the "TV" prefix.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2942
2943
def parse_age_limit(s):
    """Convert an age rating to a numeric age limit.

    Accepts a plain int (only 0..21 is considered valid), a string like
    "18" or "18+", a key of US_RATINGS, or a TV rating such as "TV-14",
    "TV_14" or "TV14". Returns None for anything else.
    """
    if type(s) is int:  # exact type check on purpose: bool must not be accepted
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    label = s.upper()
    if label in US_RATINGS:
        return US_RATINGS[label]

    # Match the part after "TV-" against the known guideline names
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, label)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2959
2960
def strip_jsonp(code):
    """Strip a JSONP wrapper such as ``callback(...);`` and return the wrapped data.

    ``code`` is returned unchanged when it does not look like JSONP.
    """
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
2969
2970
def js_to_json(code, vars={}):
    """Convert a JavaScript object/array literal to a JSON string.

    Quotes bare keys and identifiers, converts single-quoted strings,
    removes comments, ``!`` runs and trailing commas, turns ``undefined``
    and ``void 0`` into null and converts hexadecimal/octal integers to
    decimal. ``vars`` is a dict of var, val pairs to substitute for
    identifiers.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Rewrites applied inside string literals; other escapes are kept as they are
    STRING_REWRITES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }
    TOKEN_RE = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)

    def rewrite_escape(match):
        sequence = match.group(0)
        return STRING_REWRITES.get(sequence, sequence)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token in ('undefined', 'void 0'):
            return 'null'
        if token == ',' or token.startswith(('/*', '//', '!')):
            # Comments, negations and trailing commas are dropped
            return ""

        if token[0] in ("'", '"'):
            return '"%s"' % re.sub(r'(?s)\\.|"', rewrite_escape, token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                i = int(im.group(1), base)
                # An integer followed by ":" is an object key and must be quoted
                return '"%d":' % i if token.endswith(':') else '%d' % i

        if token in vars:
            return vars[token]

        return '"%s"' % token

    return re.sub(TOKEN_RE, fix_kv, code)
3017
3018
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in ``quality_ids``,
    or to -1 when the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
3027
3028
# Default output templates, keyed by template type. Fields are written as
# printf-style "%(name)s" mapping keys.
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Template types other than 'default'.
# NOTE(review): the non-None values look like the filename suffix used for
# that kind of file - confirm against the code that consumes this table
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3045
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template, not a ready-to-use regex: "{0}" stands for the pattern
# of the mapping key and "{1}" for the pattern of the conversion type.
# NOTE(review): presumably both are filled in with str.format() - confirm
# The "prefix" group captures any escaped "%%" pairs directly preceding the
# format specifier.
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# NOTE(review): looks like the printf-style conversion type characters
# described in [1] above - confirm
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3064
3065
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than ``length`` are cut so that the result, including
    the trailing "...", is ``length`` characters long. None is passed through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
3074
3075
def version_tuple(v):
    """Split a version string on "." and "-" and return its parts as a tuple of ints"""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
3078
3079
def is_outdated_version(version, limit, assume_new=True):
    """Return whether ``version`` is older than ``limit``.

    When ``version`` is empty or one of the versions cannot be parsed, the
    answer is ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
3087
3088
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from .update import is_non_updateable

    blocked = is_non_updateable()
    return not blocked
3095
3096
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
3100
3101
def error_to_compat_str(err):
    """Return the message of ``err`` as a text string"""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
3109
3110
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Any parameters (the part after ";") are ignored, and matching is
    case-insensitive. The lookup order is: the full type, the subtype, the
    structured-syntax suffix (the part after "+"). If nothing is known, the
    lower-cased subtype itself is returned with "+" replaced by ".".
    Returns None when ``mt`` is None.
    """
    if mt is None:
        return None

    # Media type and subtype names are case-insensitive, so normalize once
    # and use the same spelling for every lookup below
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3173
3174
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension, or for a file name/URL containing a dot.

    Returns None for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # A bare extension is turned into a dummy file name first
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(name)
    return mime_type
3181
3182
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video and audio codec.

    Returns a dict with 'vcodec', 'acodec' (either may be 'none') and
    'dynamic_range' when at least one known codec is found. When nothing is
    recognized but exactly two codecs are given, they are assumed to be
    video and audio, in that order. Otherwise returns an empty dict.
    A warning is written to stderr for every unknown codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')

    stripped = (entry.strip() for entry in codecs_str.strip().strip(',').split(','))
    split_codecs = [entry for entry in stripped if entry]

    vcodec = acodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. "av01" and "vp09" match "av1" and "vp9"
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec counts
                continue
            vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
    if len(split_codecs) == 2:
        first, second = split_codecs
        return {
            'vcodec': first,
            'acodec': second,
        }
    return {}
3220
3221
def urlhandle_detect_ext(url_handle):
    """Detect the file extension of a response from its headers.

    The attachment file name in Content-Disposition is tried first; the
    Content-Type header is the fallback.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        found = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(found.group('filename'), default_ext=None) if found else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3234
3235
def encode_data_uri(data, mime_type):
    """Build a base64 ``data:`` URI for the bytes ``data`` with the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3238
3239
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3248
3249
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) when the decoded text starts with
    optional whitespace followed by "<", None otherwise. """

    # The 4-byte UTF-32 marks must be tested before the UTF-16 ones,
    # because the UTF-16-LE mark is a prefix of the UTF-32-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, skip = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, skip = enc, len(bom)
            break
    text = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
3268
3269
def determine_protocol(info_dict):
    """Return the download protocol for ``info_dict``.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the start of the (sanitized) URL, then from its extension, and
    finally from its scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
3290
3291
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned.

    header_row: list with the column titles
    data:       list of rows, each with one value per column
    delim:      if set, a row made of this string is inserted below the header
    extra_gap:  number of additional spaces between columns
    hide_empty: drop the columns that are empty in every row of ``data``

    Returns the table as a single string. Neither ``header_row`` nor the
    rows of ``data`` are modified. """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for (take, col) in zip(filterArray, row) if take]

    if hide_empty:
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    # Work on stringified copies, so that the caller's rows are left
    # untouched (and rows may be tuples as well as lists)
    table = [list(map(str, row)) for row in [header_row, *data]]
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim and max_lens:
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        delim_row[-1] = delim_row[-1][:-extra_gap]  # Remove extra_gap from end of delimiter
        table.insert(1, delim_row)

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(row):
            padding = max_lens[pos] - width(text)
            if '\t' in text:
                # The padding goes where the tab is, right-aligning what follows
                cells.append(text.replace('\t', ' ' * padding) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (padding + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3323
3324
3325 def _match_one(filter_part, dct, incomplete):
3326     # TODO: Generalize code with YoutubeDL._build_format_filter
3327     STRING_OPERATORS = {
3328         '*=': operator.contains,
3329         '^=': lambda attr, value: attr.startswith(value),
3330         '$=': lambda attr, value: attr.endswith(value),
3331         '~=': lambda attr, value: re.search(value, attr),
3332     }
3333     COMPARISON_OPERATORS = {
3334         **STRING_OPERATORS,
3335         '<=': operator.le,  # "<=" must be defined above "<"
3336         '<': operator.lt,
3337         '>=': operator.ge,
3338         '>': operator.gt,
3339         '=': operator.eq,
3340     }
3341
3342     operator_rex = re.compile(r'''(?x)\s*
3343         (?P<key>[a-z_]+)
3344         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3345         (?:
3346             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3347             (?P<strval>.+?)
3348         )
3349         \s*$
3350         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3351     m = operator_rex.search(filter_part)
3352     if m:
3353         m = m.groupdict()
3354         unnegated_op = COMPARISON_OPERATORS[m['op']]
3355         if m['negation']:
3356             op = lambda attr, value: not unnegated_op(attr, value)
3357         else:
3358             op = unnegated_op
3359         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3360         if m['quote']:
3361             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3362         actual_value = dct.get(m['key'])
3363         numeric_comparison = None
3364         if isinstance(actual_value, compat_numeric_types):
3365             # If the original field is a string and matching comparisonvalue is
3366             # a number we should respect the origin of the original field
3367             # and process comparison value as a string (see
3368             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3369             try:
3370                 numeric_comparison = int(comparison_value)
3371             except ValueError:
3372                 numeric_comparison = parse_filesize(comparison_value)
3373                 if numeric_comparison is None:
3374                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3375                 if numeric_comparison is None:
3376                     numeric_comparison = parse_duration(comparison_value)
3377         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3378             raise ValueError('Operator %s only supports string values!' % m['op'])
3379         if actual_value is None:
3380             return incomplete or m['none_inclusive']
3381         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3382
3383     UNARY_OPERATORS = {
3384         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3385         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3386     }
3387     operator_rex = re.compile(r'''(?x)\s*
3388         (?P<op>%s)\s*(?P<key>[a-z_]+)
3389         \s*$
3390         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3391     m = operator_rex.search(filter_part)
3392     if m:
3393         op = UNARY_OPERATORS[m.group('op')]
3394         actual_value = dct.get(m.group('key'))
3395         if incomplete and actual_value is None:
3396             return True
3397         return op(actual_value)
3398
3399     raise ValueError('Invalid filter part %r' % filter_part)
3400
3401
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
        When incomplete, all conditions passes on missing fields

        Conditions are separated by "&"; a literal "&" is written as a
        backslash followed by "&".
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3409
3410
def match_filter_func(filter_str):
    """Create a filter function from the filter string ``filter_str``.

    The returned function takes an info dict (extra arguments are passed on
    to ``match_str``) and returns None when the dict passes the filter, or a
    message explaining that it is skipped otherwise.
    """
    def _match_func(info_dict, *args, **kwargs):
        if match_str(filter_str, info_dict, *args, **kwargs):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
3419
3420
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds.

    Two forms are recognised: a plain, optionally fractional number with an
    optional trailing "s", and a clock value "H:MM:SS" whose seconds may carry
    a fraction introduced by either "." or ":".

    @param time_expr  The time expression (may be None or empty)
    @returns The time in seconds as a float, or None when time_expr is empty
             or matches neither form
    """
    if not time_expr:
        return None

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match is not None:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match is None:
        return None
    hours, minutes, seconds = clock_match.groups()
    # A fraction written as "SS:fff" is normalised to "SS.fff" before conversion
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3432
3433
def srt_subtitles_timecode(seconds):
    """Format a time given in seconds as an SRT timecode ("HH:MM:SS,mmm")."""
    # The tuple from timetuple_from_msec() supplies the four values the
    # format string consumes
    time_fields = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time_fields
3436
3437
def ass_subtitles_timecode(seconds):
    """Format a time given in seconds as an ASS timecode ("H:MM:SS.cc")."""
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is hundredths of a second: milliseconds are divided by
    # ten and %02d drops the fractional part left by the true division
    hundredths = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], hundredths)
3441
3442
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT.

    Legacy TTAF namespace URIs are rewritten to the current TTML ones before
    parsing. Styles are collected (following inheritance through the "style"
    attribute) and the supported properties are rendered as <font>, <b>, <i>
    and <u> tags. Paragraphs that have no begin time, or neither an end time
    nor a duration, are left out of the result.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph (p) elements
    '''
    # (current namespace, [legacy namespaces that are rewritten to it])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # tts:* attributes that are translated; everything else is ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, filled in by the resolution loop below
    styles = {}
    # Style taken from the body/div elements; applied to every element first
    default_style = {}

    class TTMLPElementParser(object):
        '''XMLParser target that renders one paragraph as SRT-style markup'''
        _out = ''
        # NOTE: these two lists are class attributes and are mutated in place,
        # so they are shared by every parser instance created during one
        # dfxp2srt() call (parse_node() creates one per paragraph). Entries
        # left in _applied_styles therefore carry over to later paragraphs.
        # This is kept as-is because the generated output depends on it.
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest to highest: default style, referenced
                # style, inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the most
                        # recently applied style
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                # One entry per non-br element, consumed again by end()
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close in reverse order of opening to keep the tags nested
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialise the element and feed it through a fresh parser so that the
        # target above sees its start/end/data events
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent that only
    # becomes known later in the document, so passes are repeated while some
    # parent is still unknown.
    ignore_missing_parents = False
    while True:
        unresolved = False
        known_styles = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not ignore_missing_parents:
                    unresolved = True
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not unresolved:
            break
        if len(styles) == known_styles:
            # A pass that registers no new style can never resolve the
            # remaining references (dangling or cyclic parent, or a parent
            # without any supported property), and repeating it would loop
            # forever. Run one last pass that keeps each such style's own
            # properties and ignores the unknown parent.
            ignore_missing_parents = True

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The cue number counts every paragraph, including the ones skipped below
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3605
3606
def cli_option(params, command_option, param):
    """Build the argument list for a command-line option that takes a value.

    @param params          Mapping of parameter names to values
    @param command_option  The command-line flag to emit (e.g. '--proxy')
    @param param           Name of the parameter to look up in params
    @returns [command_option, value] with the value converted to a string,
             or an empty list when the parameter is missing or None
    """
    value = params.get(param)
    if value is None:
        return []
    # Convert every set value, including falsy ones such as 0 or False, so
    # the resulting argument list only ever contains strings. The builtin
    # str is used as this module already relies on Python 3-only syntax.
    return [command_option, str(value)]
3612
3613
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argument list for a boolean command-line option.

    @param params          Mapping of parameter names to values
    @param command_option  The command-line flag to emit
    @param param           Name of the parameter to look up in params
    @param true_value      Text emitted when the parameter is True
    @param false_value     Text emitted when the parameter is False
    @param separator       If given (and non-empty), flag and value are
                           joined into a single argument around it
    @returns An empty list when the parameter is missing or None; otherwise
             either [flag, value] or [flag + separator + value]
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
3622
3623
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare command-line flag when a parameter has the expected value.

    @returns [command_option] if params[param] equals expected_value,
             otherwise an empty list (also when the parameter is missing)
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3627
3628
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select extra command-line arguments from a keyed argument mapping.

    @param argdict     Dict mapping lower-case keys to argument lists. A plain
                       list/tuple is accepted for backward compatibility, and
                       None means that nothing is configured
    @param keys        List/tuple of lookup steps, tried in order. Each step
                       is a key or a group of keys (expanded by variadic())
    @param default     Value returned as-is (not copied) when nothing matches
    @param use_compat  Whether a list/tuple argdict is returned directly;
                       when false such an argdict is treated like None
    @returns The concatenated argument lists of the first step for which at
             least one key is present with a non-None value, else default
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        # Keys are lower-cased before the lookup
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        present = [args for args in candidates if args is not None]
        if present:
            # Flatten the matching argument lists, keeping their order
            return [arg for args in present for arg in args]
    return default
3647
3648
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Build the lookup keys for a component/executable pair and resolve them.

    The root key is the lower-cased exe when it equals the lower-cased
    main_key, otherwise "<main_key>+<exe>". Every entry of keys (or a single
    empty suffix when keys is empty or None) is appended to the root key.

    When the bare root key is among the resulting keys, the fallbacks
    (main_key, exe) -- only if the two differ -- and 'default' are added.
    Otherwise use_compat is switched off.

    @returns The result of cli_configuration_args() for the computed keys
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    suffixes = keys or ['']
    keys = [f'{root_key}{suffix}' for suffix in suffixes]
    if root_key not in keys:
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3660
3661
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes"""

    # Two-letter code -> three-letter code. Insertion order matters for
    # long2short(), which returns the first matching key.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up. Returns None
        for an unknown code.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several two-letter codes share the same three-letter code, the
        one listed first in the table is returned. Returns None for an
        unknown code.
        """
        matches = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3865
3866
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code"""

    # Upper-case two-letter country code -> country name
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the full name

        The lookup is case-insensitive. Returns None for an unknown code.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4125
4126
class GeoUtils(object):
    """Helpers for picking IPv4 addresses from per-country address blocks"""

    # Upper-case two-letter code -> one CIDR block ("address/prefix length")
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Pick a random IPv4 address from a country's block or a CIDR block

        @param code_or_block  Either a two-character code looked up
                              (case-insensitively) in _country_ip_map, or a
                              block in "address/prefix length" notation
        @returns The address in dotted-quad notation, or None when a
                 two-character code has no entry in _country_ip_map
        """
        # Any argument of exactly two characters is treated as a country code
        if len(code_or_block) != 2:
            block = code_or_block
        else:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_len = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting all bits right of the prefix yields the highest address
        host_mask = 0xffffffff >> int(prefix_len)
        highest = lowest | host_mask
        picked = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', picked)))
4385
4386
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden for each request.

    A request carrying a 'Ytdl-request-proxy' header uses that value instead
    of the configured proxy; the header is removed before going any further.
    """

    def __init__(self, proxies=None):
        # Install default http/https handlers that resolve to "no proxy"
        # before handing the configured proxies to the base class.
        # The lambda defaults freeze the per-iteration values.
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            # The header only exists to carry the override; drop it
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with socks themselves
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(self, req, proxy, type)
4410
4411
4412 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4413 # released into Public Domain
4414 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4415
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result is the shortest representation of n (no leading zero bytes).
    Zero is encoded as a single zero byte; negative values are not
    representable and also yield a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Shortest big-endian encoding: one byte per started group of 8 bits
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4444
4445
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    Leading zero bytes do not change the value and an empty string gives 0.
    This is (essentially) the inverse of long_to_bytes().
    """
    return int.from_bytes(s, 'big')
4461
4462
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt one block with OHDave's RSA scheme. See http://www.ohdave.com/rsa/

    Input:
        data: the bytes-like block to encrypt; it is read in reversed
              byte order when converted to an integer
        exponent, modulus: the RSA parameters e and N, both integer
    Output: the encrypted value as a lowercase hex string

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4478
4479
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme (v1.5 encryption block, type 2)

    The result is laid out as: 0x00, 0x02, padding string, 0x00, data.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves less than 8 bytes for padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding string must consist of nonzero octets only: the first zero
    # after the 0x02 marker is what separates the padding from the data
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4493
4494
def encode_base_n(num, n, table=None):
    """Convert an integer to its string representation in base n

    @param num      The integer to convert. Negative numbers are encoded as
                    '-' followed by the encoding of their absolute value
    @param n        The base
    @param table    The digits to use, lowest first. Defaults to the first n
                    characters of 0-9, a-z, A-Z
    @returns        The encoded string
    @raises ValueError  if n exceeds the table length, or if n is smaller
                        than 2 and num is not zero
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # Checked before the base so that zero keeps its historic result
    if num == 0:
        return table[0]

    # Repeated division never reaches zero for these bases
    if n < 2:
        raise ValueError('base %d is too small to encode non-zero numbers' % n)

    # Floor division of a negative number never reaches zero either,
    # so the sign is split off and only the magnitude is converted
    sign = '-' if num < 0 else ''
    num = abs(num)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return sign + ''.join(reversed(digits))
4511
4512
def decode_packed_codes(code):
    """Expand the packed code that PACKED_CODES_RE finds in the given text

    The match provides the obfuscated code, a base, a symbol count and a
    '|'-separated symbol list. Each word of the obfuscated code is the base-n
    encoding of an index into that list and gets replaced by the symbol
    found there, or stays as it is when that symbol is empty.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    def _expand(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, obfuscated_code)
4529
4530
def caesar(s, alphabet, shift):
    """Shift every character of s by `shift` places within alphabet

    The shift wraps around the end of the alphabet. Characters that are
    not part of the alphabet are kept unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
4538
4539
def rot47(s):
    """Rotate s by 47 places over the alphabet of '!' to '~' using caesar()"""
    rot47_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rot47_alphabet, 47)
4542
4543
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list of KEY=value pairs into a dict

    Values may be quoted; the surrounding double quotes are removed.
    When a key occurs more than once, its last value is kept.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in pairs
    }
4551
4552
def urshift(val, n):
    """Shift val right by n bits, reading a negative val as unsigned 32-bit"""
    if val < 0:
        # Map the negative number to its 32-bit two's complement value
        val += 0x100000000
    return val >> n
4555
4556
4557 # Based on png2str() written by @gdkchan and improved by @yokrysty
4558 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG file into rows of colour bytes

    Scanlines are read with 3 bytes per pixel, so only images stored that
    way (8-bit RGB, not interlaced) are decoded correctly. The fields of
    IHDR describing the pixel format are not checked, and neither are the
    chunk CRCs.

    @param png_data     The complete PNG file as bytes
    @returns            (width, height, pixels), where pixels is a list of
                        rows and each row a flat list of width * 3 ints
    @raises IOError     if the signature or the leading IHDR chunk is
                        missing, or if there is no image data
    """
    # Reference: https://www.w3.org/TR/PNG/
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    def unpack_integer(raw):
        return int.from_bytes(raw, 'big')

    # Walk the chunks by offset instead of re-slicing the remaining data,
    # which would copy the rest of the file several times per chunk
    chunks = []
    offset, end = 8, len(png_data)
    while offset < end:
        length = unpack_integer(png_data[offset:offset + 4])
        chunk_type = png_data[offset + 4:offset + 8]
        chunk_data = png_data[offset + 8:offset + 8 + length]
        chunks.append((chunk_type, chunk_data))
        offset += 12 + length  # length field, type, data and the skipped CRC

    ihdr = chunks[0][1]
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # The image is one zlib stream that may be split over several IDAT chunks
    idat = b''.join(data for chunk_type, data in chunks if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []
    previous_row = None

    for y in range(height):
        # Every scanline is preceded by one byte naming its filter
        base_pos = y * (1 + stride)
        filter_type = decompressed_data[base_pos]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + base_pos + x]
            # Neighbours are the same colour channel of the pixel to the left
            # (3 bytes back) and of the pixel above; 0 outside the image
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = previous_row[x - 3] if x > 2 and y > 0 else 0

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use whichever neighbour is closest to the estimate p
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

        previous_row = current_row

    return width, height, pixels
4662
4663
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`

    The first available method is used:
      1. the importable `xattr` module (either pyxattr or xattr)
      2. on Windows (if the import fails), an NTFS Alternate Data Stream
      3. otherwise, the `setfattr` or `xattr` command line tool

    @param path     Path of the file to tag
    @param key      Name of the attribute. Must not contain ':' on Windows
    @param value    The attribute value. It is written in binary mode and
                    decoded as UTF-8 for the command line tools, so it is
                    expected to be bytes
    @raises XAttrMetadataError      when writing the attribute fails
    @raises XAttrUnavailableError   when no usable method is found, or the
                                    installed pyxattr is too old
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Two different modules are importable as `xattr`;
        # only pyxattr provides a `set` function
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE: Despite the message, nothing falls back here. This error is
                # not an ImportError, so the handler below does not catch it
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            # Report OS level failures with the module's own error type
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No python module available; use what the platform offers instead
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # A stream is addressed as "<file path>:<stream name>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # Probe for the command line tools
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed as a command line argument, so it needs to be text
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are present
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A failing tool is reported with its exit code and stderr output
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                # The installation hint depends on the platform
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
4746
4747
def random_birthday(year_field, month_field, day_field):
    """Generate a random date of birth between 1950-01-01 and 1995-12-31

    @returns    A dict that maps the three given field names to the year,
                month and day of the date, each as a string without padding
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, date_parts)))
4758
4759
# Templates for internet shortcut files, which are plain text files.
# The placeholders are %-style mapping keys (e.g. %(url)s). `.lstrip()` removes
# the newline after the opening quotes, so each file starts with its first line.

# INI-style shortcut with an [InternetShortcut] section. Placeholder: url
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# XML property list (Apple plist DTD) with a single URL entry. Placeholder: url
# NOTE(review): the URL is inserted as-is; presumably callers escape it for XML - confirm
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# [Desktop Entry] file of type Link. Placeholders: filename, url
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Lookup of the templates above by short name
# (presumably the extension of the link file - verify against callers)
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4791
4792
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    The hostname is converted to Punycode. An explicit port is kept, except for port 80 on `http` where it is the default. IRIs without a host (e.g. relative references) are supported.

    Raises a ValueError for IRIs with a bracketed (IPv6) host.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no authority part at all
    hostname = iri_parts.hostname
    if hostname:
        net_location += hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # Port 80 may only be dropped where it is the default. For any other scheme,
    # removing it would make the URI point to a different port
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4835
4836
def to_high_limit_path(path):
    """Return the path in a form that is not subject to MAX_PATH on Windows

    On win32 and cygwin, the path is made absolute and prefixed with `\\\\?\\`.
    On other platforms, it is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    extended_length_prefix = '\\\\?\\'
    return extended_length_prefix + os.path.abspath(path)
4843
4844
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value with the template, unless the value is to be ignored

    @param obj          The value itself, or (when field is given) an object
                        whose get() method is used to look the value up
    @param field        The key to look up in obj
    @param template     %-style template the value is inserted into
    @param ignore       Values for which default is returned as it is
    @param default      Stand-in for a missing value. It is formatted like
                        any other value unless it is listed in ignore
    @param func         Optional conversion applied to non-ignored values
                        before formatting
    """
    if field is not None:
        value = obj.get(field, default)
    elif obj is None:
        value = default
    else:
        value = obj

    if func and value not in ignore:
        value = func(value)

    if value in ignore:
        return default
    return template % value
4853
4854
def clean_podcast_url(url):
    """Remove all known podcast tracking prefixes from the URL"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
4870
4871
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random (version 4) UUID in its lowercase string form

    In the template below, `x` stands for any hex digit, while `y` holds the
    variant of the UUID and is therefore limited to 8, 9, a or b.
    """
    variant_table = '89ab'

    def _random_digit(mobj):
        table = _HEX_TABLE if mobj.group(0) == 'x' else variant_table
        return table[random.randint(0, len(table) - 1)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4877
4878
def make_dir(path, to_screen=None):
    """Create the directory that is to contain `path`, if it does not exist

    @param path         Path of a file; only its parent directories are created
    @param to_screen    Optional callable that is given the error message
                        when the directory cannot be created
    @returns            True on success (including when there is nothing to
                        create), False when creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Reporting is optional; a failure must not turn into a TypeError
        # just because no callback was given
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
4889
4890
def get_executable_path():
    """Return the absolute path of the directory the program is run from

    This is the directory of the executable for frozen builds, and otherwise
    a directory relative to this file: two levels up when the module was
    loaded from a ZIP, one level up in any other case.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        location = os.path.dirname(sys.executable)
    else:
        # Running from ZIP if the module was loaded by a zipimporter
        from_zip = isinstance(globals().get('__loader__'), zipimporter)
        levels_up = '../..' if from_zip else '..'
        location = os.path.join(os.path.dirname(__file__), levels_up)
    return os.path.abspath(location)
4900
4901
def load_plugins(name, suffix, namespace):
    """Load the plugin package `name` and register the classes it defines

    The package is read from ytdlp_plugins/<name>/__init__.py below the path
    given by get_executable_path(). Every attribute of the package whose name
    ends with `suffix` and is not yet in `namespace` is added to `namespace`.

    @returns    A dict of the added attributes by name. It is empty when
                the plugin package does not exist
    """
    classes = {}
    try:
        spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        module = importlib.util.module_from_spec(spec)
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)
        for attr_name in dir(module):
            # Names already in the namespace are never overridden
            if attr_name not in namespace and attr_name.endswith(suffix):
                classes[attr_name] = namespace[attr_name] = getattr(module, attr_name)
    except FileNotFoundError:
        pass
    return classes
4920
4921
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            an int or slice (for sequences), a function,
                            a tuple of strings or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The value found by the first path that gives a result.
                            For paths that branch (function, tuple or "..."), this is
                            the list of all results, or the first of them if get_all
                            is False. If no path gives a result, default is returned
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records how deeply the result lists of the current path are
        # nested, so that they can be flattened afterwards
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if obj is None:
                return None
            if isinstance(key, (list, tuple)):
                # Collect the result of every alternative and carry on as if
                # "..." was given, i.e. with each of the collected results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # Strings are filtered by index, just like other sequences.
                    # The loop below needs (key, value) pairs
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    # A bare ":" selects everything, which is what "..." does
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Each branching key added one level of nesting to the result
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5018
5019
5020 # Deprecated
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj, which should be used instead

    Writes a deprecation warning with write_string on every call. The keys
    are handled as user input, and strings are traversed.
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5025
5026
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return x if it is a collection of values, else x wrapped in a tuple

    Iterables count as a collection, except for those of allowed_types;
    these are treated as a single value.
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, allowed_types):
        return (x,)
    return x
5029
5030
5031 # create a JSON Web Signature (jws) with HS256 algorithm
5032 # the resulting format is in JWS Compact Serialization
5033 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5034 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWT that is signed with HMAC SHA-256 (HS256)

    All three segments are base64url encoded with the trailing padding
    removed, as JWS Compact Serialization requires.

    @param payload_data     The claims; must be serializable as JSON
    @param key              The secret to sign with, as a string
    @param headers          Optional dict of header fields that are added to
                            (or replace) the default "alg" and "typ" fields
    @returns                The token `header.payload.signature`, as bytes
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)

    def _b64url(raw):
        # Standard base64 would emit '+', '/' and '=' padding,
        # none of which are allowed in a compact JWS
        return base64.urlsafe_b64encode(raw).rstrip(b'=')

    header_b64 = _b64url(json.dumps(header_data).encode('utf-8'))
    payload_b64 = _b64url(json.dumps(payload_data).encode('utf-8'))
    # The signature covers the encoded header and payload, dot included
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    signature_b64 = _b64url(h.digest())
    token = signing_input + b'.' + signature_b64
    return token
5048
5049
5050 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the payload (claims) of a JWT. The signature is NOT verified

    @param jwt      The token `header.payload.signature`, as str or bytes
    @returns        The JSON decoded payload
    """
    if isinstance(jwt, bytes):
        jwt = jwt.decode('ascii')
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # The segments of a JWT are base64url encoded without the trailing
    # padding, which the decoder insists on; so add back what was removed
    payload_b64 += '=' * (-len(payload_b64) % 4)
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
    return payload_data
5055
5056
def supports_terminal_sequences(stream):
    """Tell whether terminal (escape) sequences can be written to the stream

    This requires the stream to be a TTY and, in addition, WINDOWS_VT_MODE
    with a Windows version of at least 10.0.10586 on Windows, or a
    non-empty TERM environment variable elsewhere.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
            return False

    try:
        return stream.isatty()
    except BaseException:
        # Whatever goes wrong with the check: stay on the safe side
        return False
5068
5069
# Matches ESC [ followed by at least one character other than "m", up to the closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return the string with everything matching _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
5075
5076
def number_of_digits(number):
    """Return the length of the number when written as a decimal integer

    A minus sign counts as one character.
    """
    as_decimal = '%d' % number
    return len(as_decimal)
5079
5080
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with the delimiter

    @param values       The values to join
    @param delim        The delimiter put between the values
    @param from_dict    If given, the values are taken as keys and replaced
                        by what from_dict.get() returns for them
    """
    if from_dict is not None:
        values = [from_dict.get(key) for key in values]
    return delim.join(str(value) for value in values if value)