#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

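# Illustrative usage of xpath_with_ns (a doctest-style sketch; the namespace
# URI below is hypothetical):
#   >>> xpath_with_ns('ns0:root/ns0:child', {'ns0': 'http://example.com/ns'})
#   '{http://example.com/ns}root/{http://example.com/ns}child'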

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

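# Illustrative usage of clean_html (a doctest-style sketch; the expected
# output assumes the implementation above):
#   >>> clean_html('<p>foo &amp; bar</p><p>baz</p>')
#   'foo & bar\nbaz'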

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

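# Illustrative usage of sanitize_filename (a doctest-style sketch; expected
# values assume the implementation above):
#   >>> sanitize_filename('Foo: Bar?', restricted=True)
#   'Foo_-_Bar'
#   >>> sanitize_filename('12:34')  # timestamps keep their digits
#   '12_34'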

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

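# Illustrative usage of sanitize_url (a doctest-style sketch):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'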

def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')

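# Illustrative usage of extract_basic_auth (a doctest-style sketch; the
# credentials and host are hypothetical):
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')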

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

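# Illustrative usage of orderedSet (a doctest-style sketch):
#   >>> orderedSet(['b', 'a', 'b', 'c', 'a'])
#   ['b', 'a', 'c']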

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

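# Illustrative usage of timetuple_from_msec and formatSeconds (a doctest-style
# sketch):
#   >>> timetuple_from_msec(3661001)
#   Time(hours=1, minutes=1, seconds=1, milliseconds=1)
#   >>> formatSeconds(3661)
#   '1:01:01'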

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass

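# Illustrative usage of parse_iso8601 (a doctest-style sketch):
#   >>> parse_iso8601('2021-11-26T12:00:00Z')
#   1637928000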

def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)

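# Illustrative usage of unified_strdate (a doctest-style sketch):
#   >>> unified_strdate('26/11/2021')
#   '20211126'
#   >>> unified_strdate('11/26/2021', day_first=False)
#   '20211126'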

def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext

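# Illustrative usage of determine_ext (a doctest-style sketch):
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/page')
#   'unknown_video'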
1717
1718 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1719 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1720
1721
1722 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1723 """
1724 Return a datetime object from a string in the format YYYYMMDD or
1725 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1726
1727 format: string date format used to return datetime object from
1728 precision: round the time portion of a datetime object.
1729 auto|microsecond|second|minute|hour|day.
1730 auto: round to the unit provided in date_str (if applicable).
1731 """
1732 auto_precision = False
1733 if precision == 'auto':
1734 auto_precision = True
1735 precision = 'microsecond'
1736 today = datetime_round(datetime.datetime.now(), precision)
1737 if date_str in ('now', 'today'):
1738 return today
1739 if date_str == 'yesterday':
1740 return today - datetime.timedelta(days=1)
1741 match = re.match(
1742 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1743 date_str)
1744 if match is not None:
1745 start_time = datetime_from_str(match.group('start'), precision, format)
1746 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1747 unit = match.group('unit')
1748 if unit == 'month' or unit == 'year':
1749 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1750 unit = 'day'
1751 else:
1752 if unit == 'week':
1753 unit = 'day'
1754 time *= 7
1755 delta = datetime.timedelta(**{unit + 's': time})
1756 new_date = start_time + delta
1757 if auto_precision:
1758 return datetime_round(new_date, unit)
1759 return new_date
1760
1761 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1762
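# Example (illustrative, not executed at import): relative date strings are
# resolved against the current time, e.g.
#   datetime_from_str('now-1week', precision='day')
# returns the datetime one week before today, rounded to day precision.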
1763
1764 def date_from_str(date_str, format='%Y%m%d'):
1765 """
1766 Return a datetime object from a string in the format YYYYMMDD or
1767 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1768
1769 format: string date format used to parse date_str
1770 """
1771 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1772
1773
1774 def datetime_add_months(dt, months):
1775 """Increment/Decrement a datetime object by months."""
1776 month = dt.month + months - 1
1777 year = dt.year + month // 12
1778 month = month % 12 + 1
1779 day = min(dt.day, calendar.monthrange(year, month)[1])
1780 return dt.replace(year, month, day)
1781
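# Example (illustrative, not executed at import): the day is clamped to the
# length of the target month, so
#   datetime_add_months(datetime.datetime(2021, 1, 31), 1)
# yields datetime.datetime(2021, 2, 28, 0, 0).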
1782
1783 def datetime_round(dt, precision='day'):
1784 """
1785 Round a datetime object's time to a specific precision
1786 """
1787 if precision == 'microsecond':
1788 return dt
1789
1790 unit_seconds = {
1791 'day': 86400,
1792 'hour': 3600,
1793 'minute': 60,
1794 'second': 1,
1795 }
1796 roundto = lambda x, n: ((x + n / 2) // n) * n
1797 timestamp = calendar.timegm(dt.timetuple())
1798 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1799
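# Example (illustrative, not executed at import): rounding is half-up, so a
# time at or past noon rounds to the next day:
#   datetime_round(datetime.datetime(2021, 1, 1, 13, 0), 'day')
#   -> datetime.datetime(2021, 1, 2, 0, 0)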
1800
1801 def hyphenate_date(date_str):
1802 """
1803 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1804 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1805 if match is not None:
1806 return '-'.join(match.groups())
1807 else:
1808 return date_str
1809
1810
1811 class DateRange(object):
1812 """Represents a time interval between two dates"""
1813
1814 def __init__(self, start=None, end=None):
1815 """start and end must be strings in the format accepted by date"""
1816 if start is not None:
1817 self.start = date_from_str(start)
1818 else:
1819 self.start = datetime.datetime.min.date()
1820 if end is not None:
1821 self.end = date_from_str(end)
1822 else:
1823 self.end = datetime.datetime.max.date()
1824 if self.start > self.end:
1825 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1826
1827 @classmethod
1828 def day(cls, day):
1829 """Returns a range that only contains the given day"""
1830 return cls(day, day)
1831
1832 def __contains__(self, date):
1833 """Check if the date is in the range"""
1834 if not isinstance(date, datetime.date):
1835 date = date_from_str(date)
1836 return self.start <= date <= self.end
1837
1838 def __str__(self):
1839 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1840
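# Example (illustrative, not executed at import):
#   '20210615' in DateRange('20210101', '20211231')  -> True
#   '20220101' in DateRange.day('20210615')          -> False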
1841
1842 def platform_name():
1843 """ Returns the platform name as a compat_str """
1844 res = platform.platform()
1845 if isinstance(res, bytes):
1846 res = res.decode(preferredencoding())
1847
1848 assert isinstance(res, compat_str)
1849 return res
1850
1851
1852 def get_windows_version():
1853 ''' Get Windows version. None if it's not running on Windows '''
1854 if compat_os_name == 'nt':
1855 return version_tuple(platform.win32_ver()[1])
1856 else:
1857 return None
1858
1859
1860 def _windows_write_string(s, out):
1861 """ Returns True if the string was written using special methods,
1862 False if it has yet to be written out."""
1863 # Adapted from http://stackoverflow.com/a/3259271/35070
1864
1865 import ctypes.wintypes
1866
1867 WIN_OUTPUT_IDS = {
1868 1: -11,
1869 2: -12,
1870 }
1871
1872 try:
1873 fileno = out.fileno()
1874 except AttributeError:
1875 # If the output stream doesn't have a fileno, it's virtual
1876 return False
1877 except io.UnsupportedOperation:
1878 # Some strange Windows pseudo files?
1879 return False
1880 if fileno not in WIN_OUTPUT_IDS:
1881 return False
1882
1883 GetStdHandle = compat_ctypes_WINFUNCTYPE(
1884 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1885 ('GetStdHandle', ctypes.windll.kernel32))
1886 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1887
1888 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
1889 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1890 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1891 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
1892 written = ctypes.wintypes.DWORD(0)
1893
1894 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
1895 FILE_TYPE_CHAR = 0x0002
1896 FILE_TYPE_REMOTE = 0x8000
1897 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
1898 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1899 ctypes.POINTER(ctypes.wintypes.DWORD))(
1900 ('GetConsoleMode', ctypes.windll.kernel32))
1901 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1902
1903 def not_a_console(handle):
1904 if handle == INVALID_HANDLE_VALUE or handle is None:
1905 return True
1906 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1907 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1908
1909 if not_a_console(h):
1910 return False
1911
1912 def next_nonbmp_pos(s):
1913 try:
1914 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1915 except StopIteration:
1916 return len(s)
1917
1918 while s:
1919 count = min(next_nonbmp_pos(s), 1024)
1920
1921 ret = WriteConsoleW(
1922 h, s, count if count else 2, ctypes.byref(written), None)
1923 if ret == 0:
1924 raise OSError('Failed to write string')
1925 if not count: # We just wrote a non-BMP character
1926 assert written.value == 2
1927 s = s[1:]
1928 else:
1929 assert written.value > 0
1930 s = s[written.value:]
1931 return True
1932
1933
1934 def write_string(s, out=None, encoding=None):
1935 if out is None:
1936 out = sys.stderr
1937 assert type(s) == compat_str
1938
1939 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1940 if _windows_write_string(s, out):
1941 return
1942
1943 if ('b' in getattr(out, 'mode', '')
1944 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1945 byt = s.encode(encoding or preferredencoding(), 'ignore')
1946 out.write(byt)
1947 elif hasattr(out, 'buffer'):
1948 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1949 byt = s.encode(enc, 'ignore')
1950 out.buffer.write(byt)
1951 else:
1952 out.write(s)
1953 out.flush()
1954
1955
1956 def bytes_to_intlist(bs):
1957 if not bs:
1958 return []
1959 if isinstance(bs[0], int): # Python 3
1960 return list(bs)
1961 else:
1962 return [ord(c) for c in bs]
1963
1964
1965 def intlist_to_bytes(xs):
1966 if not xs:
1967 return b''
1968 return compat_struct_pack('%dB' % len(xs), *xs)
1969
1970
1971 # Cross-platform file locking
1972 if sys.platform == 'win32':
1973 import ctypes.wintypes
1974 import msvcrt
1975
1976 class OVERLAPPED(ctypes.Structure):
1977 _fields_ = [
1978 ('Internal', ctypes.wintypes.LPVOID),
1979 ('InternalHigh', ctypes.wintypes.LPVOID),
1980 ('Offset', ctypes.wintypes.DWORD),
1981 ('OffsetHigh', ctypes.wintypes.DWORD),
1982 ('hEvent', ctypes.wintypes.HANDLE),
1983 ]
1984
1985 kernel32 = ctypes.windll.kernel32
1986 LockFileEx = kernel32.LockFileEx
1987 LockFileEx.argtypes = [
1988 ctypes.wintypes.HANDLE, # hFile
1989 ctypes.wintypes.DWORD, # dwFlags
1990 ctypes.wintypes.DWORD, # dwReserved
1991 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1992 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1993 ctypes.POINTER(OVERLAPPED) # Overlapped
1994 ]
1995 LockFileEx.restype = ctypes.wintypes.BOOL
1996 UnlockFileEx = kernel32.UnlockFileEx
1997 UnlockFileEx.argtypes = [
1998 ctypes.wintypes.HANDLE, # hFile
1999 ctypes.wintypes.DWORD, # dwReserved
2000 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2001 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2002 ctypes.POINTER(OVERLAPPED) # Overlapped
2003 ]
2004 UnlockFileEx.restype = ctypes.wintypes.BOOL
2005 whole_low = 0xffffffff
2006 whole_high = 0x7fffffff
2007
2008 def _lock_file(f, exclusive):
2009 overlapped = OVERLAPPED()
2010 overlapped.Offset = 0
2011 overlapped.OffsetHigh = 0
2012 overlapped.hEvent = 0
2013 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2014 handle = msvcrt.get_osfhandle(f.fileno())
2015 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2016 whole_low, whole_high, f._lock_file_overlapped_p):
2017 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2018
2019 def _unlock_file(f):
2020 assert f._lock_file_overlapped_p
2021 handle = msvcrt.get_osfhandle(f.fileno())
2022 if not UnlockFileEx(handle, 0,
2023 whole_low, whole_high, f._lock_file_overlapped_p):
2024 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2025
2026 else:
2027 # Some platforms, such as Jython, are missing fcntl
2028 try:
2029 import fcntl
2030
2031 def _lock_file(f, exclusive):
2032 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
2033
2034 def _unlock_file(f):
2035 fcntl.flock(f, fcntl.LOCK_UN)
2036 except ImportError:
2037 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2038
2039 def _lock_file(f, exclusive):
2040 raise IOError(UNSUPPORTED_MSG)
2041
2042 def _unlock_file(f):
2043 raise IOError(UNSUPPORTED_MSG)
2044
2045
2046 class locked_file(object):
2047 def __init__(self, filename, mode, encoding=None):
2048 assert mode in ['r', 'a', 'w']
2049 self.f = io.open(filename, mode, encoding=encoding)
2050 self.mode = mode
2051
2052 def __enter__(self):
2053 exclusive = self.mode != 'r'
2054 try:
2055 _lock_file(self.f, exclusive)
2056 except IOError:
2057 self.f.close()
2058 raise
2059 return self
2060
2061 def __exit__(self, etype, value, traceback):
2062 try:
2063 _unlock_file(self.f)
2064 finally:
2065 self.f.close()
2066
2067 def __iter__(self):
2068 return iter(self.f)
2069
2070 def write(self, *args):
2071 return self.f.write(*args)
2072
2073 def read(self, *args):
2074 return self.f.read(*args)
2075
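# Example (illustrative, not executed at import; the path is hypothetical):
# the lock is held for the duration of the with-block and released on exit:
#   with locked_file('/tmp/state.txt', 'w') as f:
#       f.write('data')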
2076
2077 def get_filesystem_encoding():
2078 encoding = sys.getfilesystemencoding()
2079 return encoding if encoding is not None else 'utf-8'
2080
2081
2082 def shell_quote(args):
2083 quoted_args = []
2084 encoding = get_filesystem_encoding()
2085 for a in args:
2086 if isinstance(a, bytes):
2087 # We may get a filename encoded with 'encodeFilename'
2088 a = a.decode(encoding)
2089 quoted_args.append(compat_shlex_quote(a))
2090 return ' '.join(quoted_args)
2091
2092
2093 def smuggle_url(url, data):
2094 """ Pass additional data in a URL for internal use. """
2095
2096 url, idata = unsmuggle_url(url, {})
2097 data.update(idata)
2098 sdata = compat_urllib_parse_urlencode(
2099 {'__youtubedl_smuggle': json.dumps(data)})
2100 return url + '#' + sdata
2101
2102
2103 def unsmuggle_url(smug_url, default=None):
2104 if '#__youtubedl_smuggle' not in smug_url:
2105 return smug_url, default
2106 url, _, sdata = smug_url.rpartition('#')
2107 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2108 data = json.loads(jsond)
2109 return url, data
2110
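# Example (illustrative, not executed at import): data survives a round trip
# through the URL fragment:
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url(url)  -> ('https://example.com/video', {'referer': 'https://example.com'})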
2111
2112 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2113 """ Formats numbers with decimal sufixes like K, M, etc """
2114 num, factor = float_or_none(num), float(factor)
2115 if num is None:
2116 return None
2117 exponent = 0 if num == 0 else min(max(int(math.log(num, factor)), 0), len('KMGTPEZY'))  # clamp to the available suffixes
2118 suffix = ['', *'KMGTPEZY'][exponent]
2119 converted = num / (factor ** exponent)
2120 return fmt % (converted, suffix)
2121
2122
2123 def format_bytes(bytes):
2124 return format_decimal_suffix(bytes, '%.2f%siB', factor=1024) or 'N/A'
2125
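# Example (illustrative, not executed at import):
#   format_decimal_suffix(1500, '%d%s')  -> '1K'
#   format_bytes(1024 ** 2)              -> '1.00MiB'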
2126
2127 def lookup_unit_table(unit_table, s):
2128 units_re = '|'.join(re.escape(u) for u in unit_table)
2129 m = re.match(
2130 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2131 if not m:
2132 return None
2133 num_str = m.group('num').replace(',', '.')
2134 mult = unit_table[m.group('unit')]
2135 return int(float(num_str) * mult)
2136
2137
2138 def parse_filesize(s):
2139 if s is None:
2140 return None
2141
2142 # The lower-case forms are of course incorrect and unofficial,
2143 # but we support those too
2144 _UNIT_TABLE = {
2145 'B': 1,
2146 'b': 1,
2147 'bytes': 1,
2148 'KiB': 1024,
2149 'KB': 1000,
2150 'kB': 1024,
2151 'Kb': 1000,
2152 'kb': 1000,
2153 'kilobytes': 1000,
2154 'kibibytes': 1024,
2155 'MiB': 1024 ** 2,
2156 'MB': 1000 ** 2,
2157 'mB': 1024 ** 2,
2158 'Mb': 1000 ** 2,
2159 'mb': 1000 ** 2,
2160 'megabytes': 1000 ** 2,
2161 'mebibytes': 1024 ** 2,
2162 'GiB': 1024 ** 3,
2163 'GB': 1000 ** 3,
2164 'gB': 1024 ** 3,
2165 'Gb': 1000 ** 3,
2166 'gb': 1000 ** 3,
2167 'gigabytes': 1000 ** 3,
2168 'gibibytes': 1024 ** 3,
2169 'TiB': 1024 ** 4,
2170 'TB': 1000 ** 4,
2171 'tB': 1024 ** 4,
2172 'Tb': 1000 ** 4,
2173 'tb': 1000 ** 4,
2174 'terabytes': 1000 ** 4,
2175 'tebibytes': 1024 ** 4,
2176 'PiB': 1024 ** 5,
2177 'PB': 1000 ** 5,
2178 'pB': 1024 ** 5,
2179 'Pb': 1000 ** 5,
2180 'pb': 1000 ** 5,
2181 'petabytes': 1000 ** 5,
2182 'pebibytes': 1024 ** 5,
2183 'EiB': 1024 ** 6,
2184 'EB': 1000 ** 6,
2185 'eB': 1024 ** 6,
2186 'Eb': 1000 ** 6,
2187 'eb': 1000 ** 6,
2188 'exabytes': 1000 ** 6,
2189 'exbibytes': 1024 ** 6,
2190 'ZiB': 1024 ** 7,
2191 'ZB': 1000 ** 7,
2192 'zB': 1024 ** 7,
2193 'Zb': 1000 ** 7,
2194 'zb': 1000 ** 7,
2195 'zettabytes': 1000 ** 7,
2196 'zebibytes': 1024 ** 7,
2197 'YiB': 1024 ** 8,
2198 'YB': 1000 ** 8,
2199 'yB': 1024 ** 8,
2200 'Yb': 1000 ** 8,
2201 'yb': 1000 ** 8,
2202 'yottabytes': 1000 ** 8,
2203 'yobibytes': 1024 ** 8,
2204 }
2205
2206 return lookup_unit_table(_UNIT_TABLE, s)
2207
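# Example (illustrative, not executed at import): decimal and binary units are
# distinguished, and ',' is treated as a decimal separator:
#   parse_filesize('1.2Mb')  -> 1200000
#   parse_filesize('5 GiB')  -> 5368709120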
2208
2209 def parse_count(s):
2210 if s is None:
2211 return None
2212
2213 s = s.strip()
2214
2215 if re.match(r'^[\d,.]+$', s):
2216 return str_to_int(s)
2217
2218 _UNIT_TABLE = {
2219 'k': 1000,
2220 'K': 1000,
2221 'm': 1000 ** 2,
2222 'M': 1000 ** 2,
2223 'kk': 1000 ** 2,
2224 'KK': 1000 ** 2,
2225 }
2226
2227 return lookup_unit_table(_UNIT_TABLE, s)
2228
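# Example (illustrative, not executed at import):
#   parse_count('1,234')  -> 1234
#   parse_count('1.5M')   -> 1500000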
2229
2230 def parse_resolution(s):
2231 if s is None:
2232 return {}
2233
2234 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2235 if mobj:
2236 return {
2237 'width': int(mobj.group('w')),
2238 'height': int(mobj.group('h')),
2239 }
2240
2241 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2242 if mobj:
2243 return {'height': int(mobj.group(1))}
2244
2245 mobj = re.search(r'\b([48])[kK]\b', s)
2246 if mobj:
2247 return {'height': int(mobj.group(1)) * 540}
2248
2249 return {}
2250
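# Example (illustrative, not executed at import):
#   parse_resolution('1920x1080')  -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       -> {'height': 720}
#   parse_resolution('4k')         -> {'height': 2160}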
2251
2252 def parse_bitrate(s):
2253 if not isinstance(s, compat_str):
2254 return
2255 mobj = re.search(r'\b(\d+)\s*kbps', s)
2256 if mobj:
2257 return int(mobj.group(1))
2258
2259
2260 def month_by_name(name, lang='en'):
2261 """ Return the number of a month by (locale-independently) English name """
2262
2263 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2264
2265 try:
2266 return month_names.index(name) + 1
2267 except ValueError:
2268 return None
2269
2270
2271 def month_by_abbreviation(abbrev):
2272 """ Return the number of a month by (locale-independently) English
2273 abbreviations """
2274
2275 try:
2276 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2277 except ValueError:
2278 return None
2279
2280
2281 def fix_xml_ampersands(xml_str):
2282 """Replace all the '&' by '&amp;' in XML"""
2283 return re.sub(
2284 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2285 '&amp;',
2286 xml_str)
2287
2288
2289 def setproctitle(title):
2290 assert isinstance(title, compat_str)
2291
2292 # ctypes in Jython is not complete
2293 # http://bugs.jython.org/issue2148
2294 if sys.platform.startswith('java'):
2295 return
2296
2297 try:
2298 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2299 except OSError:
2300 return
2301 except TypeError:
2302 # LoadLibrary in Windows Python 2.7.13 only expects
2303 # a bytestring, but since unicode_literals turns
2304 # every string into a unicode string, it fails.
2305 return
2306 title_bytes = title.encode('utf-8')
2307 buf = ctypes.create_string_buffer(len(title_bytes))
2308 buf.value = title_bytes
2309 try:
2310 libc.prctl(15, buf, 0, 0, 0)
2311 except AttributeError:
2312 return # Strange libc, just skip this
2313
2314
2315 def remove_start(s, start):
2316 return s[len(start):] if s is not None and s.startswith(start) else s
2317
2318
2319 def remove_end(s, end):
2320 return s[:-len(end)] if s is not None and s.endswith(end) else s
2321
2322
2323 def remove_quotes(s):
2324 if s is None or len(s) < 2:
2325 return s
2326 for quote in ('"', "'", ):
2327 if s[0] == quote and s[-1] == quote:
2328 return s[1:-1]
2329 return s
2330
2331
2332 def get_domain(url):
2333 domain = re.match(r'(?:https?://)?(?:www\.)?(?P<domain>[^\n/]+\.[^\n/]+)(?:/(.*))?', url)
2334 return domain.group('domain') if domain else None
2335
2336
2337 def url_basename(url):
2338 path = compat_urlparse.urlparse(url).path
2339 return path.strip('/').split('/')[-1]
2340
2341
2342 def base_url(url):
2343 return re.match(r'https?://[^?#&]+/', url).group()
2344
2345
2346 def urljoin(base, path):
2347 if isinstance(path, bytes):
2348 path = path.decode('utf-8')
2349 if not isinstance(path, compat_str) or not path:
2350 return None
2351 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
2352 return path
2353 if isinstance(base, bytes):
2354 base = base.decode('utf-8')
2355 if not isinstance(base, compat_str) or not re.match(
2356 r'^(?:https?:)?//', base):
2357 return None
2358 return compat_urlparse.urljoin(base, path)
2359
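# Example (illustrative, not executed at import): absolute and protocol-relative
# paths are returned as-is, anything else is resolved against the base:
#   urljoin('https://example.com/a/', 'b.mp4')             -> 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/', '//cdn.example.com/x') -> '//cdn.example.com/x'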
2360
2361 class HEADRequest(compat_urllib_request.Request):
2362 def get_method(self):
2363 return 'HEAD'
2364
2365
2366 class PUTRequest(compat_urllib_request.Request):
2367 def get_method(self):
2368 return 'PUT'
2369
2370
2371 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2372 if get_attr:
2373 if v is not None:
2374 v = getattr(v, get_attr, None)
2375 if v == '':
2376 v = None
2377 if v is None:
2378 return default
2379 try:
2380 return int(v) * invscale // scale
2381 except (ValueError, TypeError, OverflowError):
2382 return default
2383
2384
2385 def str_or_none(v, default=None):
2386 return default if v is None else compat_str(v)
2387
2388
2389 def str_to_int(int_str):
2390 """ A more relaxed version of int_or_none """
2391 if isinstance(int_str, compat_integer_types):
2392 return int_str
2393 elif isinstance(int_str, compat_str):
2394 int_str = re.sub(r'[,\.\+]', '', int_str)
2395 return int_or_none(int_str)
2396
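# Example (illustrative, not executed at import): note that '.' is stripped as
# a thousands separator, not parsed as a decimal point:
#   str_to_int('123,456')  -> 123456
#   str_to_int('123.456')  -> 123456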
2397
2398 def float_or_none(v, scale=1, invscale=1, default=None):
2399 if v is None:
2400 return default
2401 try:
2402 return float(v) * invscale / scale
2403 except (ValueError, TypeError):
2404 return default
2405
2406
2407 def bool_or_none(v, default=None):
2408 return v if isinstance(v, bool) else default
2409
2410
2411 def strip_or_none(v, default=None):
2412 return v.strip() if isinstance(v, compat_str) else default
2413
2414
2415 def url_or_none(url):
2416 if not url or not isinstance(url, compat_str):
2417 return None
2418 url = url.strip()
2419 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2420
2421
2422 def strftime_or_none(timestamp, date_format, default=None):
2423 datetime_object = None
2424 try:
2425 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2426 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2427 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2428 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2429 return datetime_object.strftime(date_format)
2430 except (ValueError, TypeError, AttributeError):
2431 return default
2432
2433
2434 def parse_duration(s):
2435 if not isinstance(s, compat_basestring):
2436 return None
2437 s = s.strip()
2438 if not s:
2439 return None
2440
2441 days, hours, mins, secs, ms = [None] * 5
2442 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
2443 if m:
2444 days, hours, mins, secs, ms = m.groups()
2445 else:
2446 m = re.match(
2447 r'''(?ix)(?:P?
2448 (?:
2449 [0-9]+\s*y(?:ears?)?\s*
2450 )?
2451 (?:
2452 [0-9]+\s*m(?:onths?)?\s*
2453 )?
2454 (?:
2455 [0-9]+\s*w(?:eeks?)?\s*
2456 )?
2457 (?:
2458 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2459 )?
2460 T)?
2461 (?:
2462 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2463 )?
2464 (?:
2465 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2466 )?
2467 (?:
2468 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2469 )?Z?$''', s)
2470 if m:
2471 days, hours, mins, secs, ms = m.groups()
2472 else:
2473 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2474 if m:
2475 hours, mins = m.groups()
2476 else:
2477 return None
2478
2479 duration = 0
2480 if secs:
2481 duration += float(secs)
2482 if mins:
2483 duration += float(mins) * 60
2484 if hours:
2485 duration += float(hours) * 60 * 60
2486 if days:
2487 duration += float(days) * 24 * 60 * 60
2488 if ms:
2489 duration += float(ms)
2490 return duration
2491
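# Example (illustrative, not executed at import): both colon-separated and
# ISO 8601-style durations are supported:
#   parse_duration('1:30')     -> 90.0
#   parse_duration('PT1H30M')  -> 5400.0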
2492
2493 def prepend_extension(filename, ext, expected_real_ext=None):
2494 name, real_ext = os.path.splitext(filename)
2495 return (
2496 '{0}.{1}{2}'.format(name, ext, real_ext)
2497 if not expected_real_ext or real_ext[1:] == expected_real_ext
2498 else '{0}.{1}'.format(filename, ext))
2499
2500
2501 def replace_extension(filename, ext, expected_real_ext=None):
2502 name, real_ext = os.path.splitext(filename)
2503 return '{0}.{1}'.format(
2504 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2505 ext)
2506
2507
2508 def check_executable(exe, args=[]):
2509 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2510 args can be a list of arguments for a short output (like -version) """
2511 try:
2512 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2513 except OSError:
2514 return False
2515 return exe
2516
2517
2518 def _get_exe_version_output(exe, args):
2519 try:
2520 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2521 # SIGTTOU if yt-dlp is run in the background.
2522 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2523 out, _ = Popen(
2524 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2525 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2526 except OSError:
2527 return False
2528 if isinstance(out, bytes): # Python 2.x
2529 out = out.decode('ascii', 'ignore')
2530 return out
2531
2532
2533 def detect_exe_version(output, version_re=None, unrecognized='present'):
2534 assert isinstance(output, compat_str)
2535 if version_re is None:
2536 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2537 m = re.search(version_re, output)
2538 if m:
2539 return m.group(1)
2540 else:
2541 return unrecognized
2542
2543
2544 def get_exe_version(exe, args=['--version'],
2545 version_re=None, unrecognized='present'):
2546 """ Returns the version of the specified executable,
2547 or False if the executable is not present """
2548 out = _get_exe_version_output(exe, args)
2549 return detect_exe_version(out, version_re, unrecognized) if out else False
2550
2551
2552 class LazyList(collections.abc.Sequence):
2553 ''' Lazy immutable list from an iterable
2554 Note that slices of a LazyList are lists and not LazyLists'''
2555
2556 class IndexError(IndexError):
2557 pass
2558
2559 def __init__(self, iterable, *, reverse=False, _cache=None):
2560 self.__iterable = iter(iterable)
2561 self.__cache = [] if _cache is None else _cache
2562 self.__reversed = reverse
2563
2564 def __iter__(self):
2565 if self.__reversed:
2566 # We need to consume the entire iterable to iterate in reverse
2567 yield from self.exhaust()
2568 return
2569 yield from self.__cache
2570 for item in self.__iterable:
2571 self.__cache.append(item)
2572 yield item
2573
2574 def __exhaust(self):
2575 self.__cache.extend(self.__iterable)
2576 # Discard the emptied iterable to make it pickle-able
2577 self.__iterable = []
2578 return self.__cache
2579
2580 def exhaust(self):
2581 ''' Evaluate the entire iterable '''
2582 return self.__exhaust()[::-1 if self.__reversed else 1]
2583
2584 @staticmethod
2585 def __reverse_index(x):
2586 return None if x is None else -(x + 1)
2587
2588 def __getitem__(self, idx):
2589 if isinstance(idx, slice):
2590 if self.__reversed:
2591 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2592 start, stop, step = idx.start, idx.stop, idx.step or 1
2593 elif isinstance(idx, int):
2594 if self.__reversed:
2595 idx = self.__reverse_index(idx)
2596 start, stop, step = idx, idx, 0
2597 else:
2598 raise TypeError('indices must be integers or slices')
2599 if ((start or 0) < 0 or (stop or 0) < 0
2600 or (start is None and step < 0)
2601 or (stop is None and step > 0)):
2602 # We need to consume the entire iterable to be able to slice from the end
2603 # Obviously, never use this with infinite iterables
2604 self.__exhaust()
2605 try:
2606 return self.__cache[idx]
2607 except IndexError as e:
2608 raise self.IndexError(e) from e
2609 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2610 if n > 0:
2611 self.__cache.extend(itertools.islice(self.__iterable, n))
2612 try:
2613 return self.__cache[idx]
2614 except IndexError as e:
2615 raise self.IndexError(e) from e
2616
2617 def __bool__(self):
2618 try:
2619 self[-1] if self.__reversed else self[0]
2620 except self.IndexError:
2621 return False
2622 return True
2623
2624 def __len__(self):
2625 self.__exhaust()
2626 return len(self.__cache)
2627
2628 def __reversed__(self):
2629 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2630
2631 def __copy__(self):
2632 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2633
2634 def __repr__(self):
2635 # repr and str should mimic a list. So we exhaust the iterable
2636 return repr(self.exhaust())
2637
2638 def __str__(self):
2639 return repr(self.exhaust())
2640
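# Example (illustrative, not executed at import): only as many items as needed
# are pulled from the underlying iterable:
#   lst = LazyList(itertools.count())
#   lst[:5]  -> [0, 1, 2, 3, 4]  (a plain list; the iterable is not exhausted)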
2641
2642 class PagedList:
2643
2644 class IndexError(IndexError):
2645 pass
2646
2647 def __len__(self):
2648 # This is only useful for tests
2649 return len(self.getslice())
2650
2651 def __init__(self, pagefunc, pagesize, use_cache=True):
2652 self._pagefunc = pagefunc
2653 self._pagesize = pagesize
2654 self._use_cache = use_cache
2655 self._cache = {}
2656
2657 def getpage(self, pagenum):
2658 page_results = self._cache.get(pagenum)
2659 if page_results is None:
2660 page_results = list(self._pagefunc(pagenum))
2661 if self._use_cache:
2662 self._cache[pagenum] = page_results
2663 return page_results
2664
2665 def getslice(self, start=0, end=None):
2666 return list(self._getslice(start, end))
2667
2668 def _getslice(self, start, end):
2669 raise NotImplementedError('This method must be implemented by subclasses')
2670
2671 def __getitem__(self, idx):
2672 # NOTE: cache must be enabled if this is used
2673 if not isinstance(idx, int) or idx < 0:
2674 raise TypeError('indices must be non-negative integers')
2675 entries = self.getslice(idx, idx + 1)
2676 if not entries:
2677 raise self.IndexError()
2678 return entries[0]
2679
2680
2681 class OnDemandPagedList(PagedList):
2682 def _getslice(self, start, end):
2683 for pagenum in itertools.count(start // self._pagesize):
2684 firstid = pagenum * self._pagesize
2685 nextfirstid = pagenum * self._pagesize + self._pagesize
2686 if start >= nextfirstid:
2687 continue
2688
2689 startv = (
2690 start % self._pagesize
2691 if firstid <= start < nextfirstid
2692 else 0)
2693 endv = (
2694 ((end - 1) % self._pagesize) + 1
2695 if (end is not None and firstid <= end <= nextfirstid)
2696 else None)
2697
2698 page_results = self.getpage(pagenum)
2699 if startv != 0 or endv is not None:
2700 page_results = page_results[startv:endv]
2701 yield from page_results
2702
2703 # A little optimization - if the current page is not "full", i.e. does
2704 # not contain page_size videos, then we can assume that this page
2705 # is the last one - there are no more ids on further pages,
2706 # so there is no need to query again.
2707 if len(page_results) + startv < self._pagesize:
2708 break
2709
2710 # If we got the whole page, but the next page is not interesting,
2711 # break out early as well
2712 if end == nextfirstid:
2713 break
2714
2715
2716 class InAdvancePagedList(PagedList):
2717 def __init__(self, pagefunc, pagecount, pagesize):
2718 self._pagecount = pagecount
2719 PagedList.__init__(self, pagefunc, pagesize, True)
2720
2721 def _getslice(self, start, end):
2722 start_page = start // self._pagesize
2723 end_page = (
2724 self._pagecount if end is None else (end // self._pagesize + 1))
2725 skip_elems = start - start_page * self._pagesize
2726 only_more = None if end is None else end - start
2727 for pagenum in range(start_page, end_page):
2728 page_results = self.getpage(pagenum)
2729 if skip_elems:
2730 page_results = page_results[skip_elems:]
2731 skip_elems = None
2732 if only_more is not None:
2733 if len(page_results) < only_more:
2734 only_more -= len(page_results)
2735 else:
2736 yield from page_results[:only_more]
2737 break
2738 yield from page_results
2739
2740
2741 def uppercase_escape(s):
2742 unicode_escape = codecs.getdecoder('unicode_escape')
2743 return re.sub(
2744 r'\\U[0-9a-fA-F]{8}',
2745 lambda m: unicode_escape(m.group(0))[0],
2746 s)
2747
2748
2749 def lowercase_escape(s):
2750 unicode_escape = codecs.getdecoder('unicode_escape')
2751 return re.sub(
2752 r'\\u[0-9a-fA-F]{4}',
2753 lambda m: unicode_escape(m.group(0))[0],
2754 s)
2755
2756
2757 def escape_rfc3986(s):
2758 """Escape non-ASCII characters as suggested by RFC 3986"""
2759 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2760 s = s.encode('utf-8')
2761 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2762
2763
2764 def escape_url(url):
2765 """Escape URL as suggested by RFC 3986"""
2766 url_parsed = compat_urllib_parse_urlparse(url)
2767 return url_parsed._replace(
2768 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2769 path=escape_rfc3986(url_parsed.path),
2770 params=escape_rfc3986(url_parsed.params),
2771 query=escape_rfc3986(url_parsed.query),
2772 fragment=escape_rfc3986(url_parsed.fragment)
2773 ).geturl()
2774
2775
2776 def parse_qs(url):
2777 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2778
2779
2780 def read_batch_urls(batch_fd):
2781 def fixup(url):
2782 if not isinstance(url, compat_str):
2783 url = url.decode('utf-8', 'replace')
2784 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2785 for bom in BOM_UTF8:
2786 if url.startswith(bom):
2787 url = url[len(bom):]
2788 url = url.lstrip()
2789 if not url or url.startswith(('#', ';', ']')):
2790 return False
2791 # "#" cannot be stripped out since it is part of the URI
2792 # However, it can be safely stripped out when it follows a whitespace
2793 return re.split(r'\s#', url, 1)[0].rstrip()
2794
2795 with contextlib.closing(batch_fd) as fd:
2796 return [url for url in map(fixup, fd) if url]
2797
2798
2799 def urlencode_postdata(*args, **kargs):
2800 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2801
2802
2803 def update_url_query(url, query):
2804 if not query:
2805 return url
2806 parsed_url = compat_urlparse.urlparse(url)
2807 qs = compat_parse_qs(parsed_url.query)
2808 qs.update(query)
2809 return compat_urlparse.urlunparse(parsed_url._replace(
2810 query=compat_urllib_parse_urlencode(qs, True)))
2811
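# Example (illustrative, not executed at import; parameter order assumes
# insertion-ordered dicts):
#   update_url_query('http://example.com/path?a=1', {'b': '2'})
#   -> 'http://example.com/path?a=1&b=2'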
2812
2813 def update_Request(req, url=None, data=None, headers={}, query={}):
2814 req_headers = req.headers.copy()
2815 req_headers.update(headers)
2816 req_data = data or req.data
2817 req_url = update_url_query(url or req.get_full_url(), query)
2818 req_get_method = req.get_method()
2819 if req_get_method == 'HEAD':
2820 req_type = HEADRequest
2821 elif req_get_method == 'PUT':
2822 req_type = PUTRequest
2823 else:
2824 req_type = compat_urllib_request.Request
2825 new_req = req_type(
2826 req_url, data=req_data, headers=req_headers,
2827 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2828 if hasattr(req, 'timeout'):
2829 new_req.timeout = req.timeout
2830 return new_req
2831
2832
2833 def _multipart_encode_impl(data, boundary):
2834 content_type = 'multipart/form-data; boundary=%s' % boundary
2835
2836 out = b''
2837 for k, v in data.items():
2838 out += b'--' + boundary.encode('ascii') + b'\r\n'
2839 if isinstance(k, compat_str):
2840 k = k.encode('utf-8')
2841 if isinstance(v, compat_str):
2842 v = v.encode('utf-8')
2843 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2844 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2845 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2846 if boundary.encode('ascii') in content:
2847 raise ValueError('Boundary overlaps with data')
2848 out += content
2849
2850 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2851
2852 return out, content_type
2853
2854
2855 def multipart_encode(data, boundary=None):
2856 '''
2857 Encode a dict to RFC 7578-compliant form-data
2858
2859 data:
2860 A dict where keys and values can be either Unicode or bytes-like
2861 objects.
2862 boundary:
2863 If specified, it must be a Unicode object and is used as the boundary. Otherwise
2864 a random boundary is generated.
2865
2866 Reference: https://tools.ietf.org/html/rfc7578
2867 '''
2868 has_specified_boundary = boundary is not None
2869
2870 while True:
2871 if boundary is None:
2872 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2873
2874 try:
2875 out, content_type = _multipart_encode_impl(data, boundary)
2876 break
2877 except ValueError:
2878 if has_specified_boundary:
2879 raise
2880 boundary = None
2881
2882 return out, content_type
2883
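# Example (illustrative, not executed at import; the boundary is supplied here
# for determinism):
#   out, ct = multipart_encode({'field': 'value'}, boundary='AAAA')
#   ct   -> 'multipart/form-data; boundary=AAAA'
#   out  -> b'--AAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAA--\r\n'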
2884
2885 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2886 if isinstance(key_or_keys, (list, tuple)):
2887 for key in key_or_keys:
2888 if key not in d or d[key] is None or skip_false_values and not d[key]:
2889 continue
2890 return d[key]
2891 return default
2892 return d.get(key_or_keys, default)
2893
2894
2895 def try_get(src, getter, expected_type=None):
2896 for get in variadic(getter):
2897 try:
2898 v = get(src)
2899 except (AttributeError, KeyError, TypeError, IndexError):
2900 pass
2901 else:
2902 if expected_type is None or isinstance(v, expected_type):
2903 return v
2904
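# Example (illustrative, not executed at import): failed lookups and type
# mismatches yield None instead of raising:
#   try_get({'a': [1, 2]}, lambda x: x['a'][0], int)  -> 1
#   try_get({}, lambda x: x['a'][0], int)             -> None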
2905
2906 def merge_dicts(*dicts):
2907 merged = {}
2908 for a_dict in dicts:
2909 for k, v in a_dict.items():
2910 if v is None:
2911 continue
2912 if (k not in merged
2913 or (isinstance(v, compat_str) and v
2914 and isinstance(merged[k], compat_str)
2915 and not merged[k])):
2916 merged[k] = v
2917 return merged
2918
2919
2920 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2921 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2922
2923
2924 US_RATINGS = {
2925 'G': 0,
2926 'PG': 10,
2927 'PG-13': 13,
2928 'R': 16,
2929 'NC': 18,
2930 }
2931
2932
2933 TV_PARENTAL_GUIDELINES = {
2934 'TV-Y': 0,
2935 'TV-Y7': 7,
2936 'TV-G': 0,
2937 'TV-PG': 0,
2938 'TV-14': 14,
2939 'TV-MA': 17,
2940 }
2941
2942
2943 def parse_age_limit(s):
2944 if type(s) == int:
2945 return s if 0 <= s <= 21 else None
2946 if not isinstance(s, compat_basestring):
2947 return None
2948 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2949 if m:
2950 return int(m.group('age'))
2951 s = s.upper()
2952 if s in US_RATINGS:
2953 return US_RATINGS[s]
2954 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2955 if m:
2956 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2957 return None
2958
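# Example (illustrative, not executed at import):
#   parse_age_limit('18+')    -> 18
#   parse_age_limit('PG-13')  -> 13
#   parse_age_limit('TV-MA')  -> 17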
2959
2960 def strip_jsonp(code):
2961 return re.sub(
2962 r'''(?sx)^
2963 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2964 (?:\s*&&\s*(?P=func_name))?
2965 \s*\(\s*(?P<callback_data>.*)\);?
2966 \s*?(?://[^\n]*)*$''',
2967 r'\g<callback_data>', code)
2968
2969
2970 def js_to_json(code, vars={}):
2971 # vars is a dict of var, val pairs to substitute
2972 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2973 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2974 INTEGER_TABLE = (
2975 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2976 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2977 )
2978
2979 def fix_kv(m):
2980 v = m.group(0)
2981 if v in ('true', 'false', 'null'):
2982 return v
2983 elif v in ('undefined', 'void 0'):
2984 return 'null'
2985 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
2986 return ""
2987
2988 if v[0] in ("'", '"'):
2989 v = re.sub(r'(?s)\\.|"', lambda m: {
2990 '"': '\\"',
2991 "\\'": "'",
2992 '\\\n': '',
2993 '\\x': '\\u00',
2994 }.get(m.group(0), m.group(0)), v[1:-1])
2995 else:
2996 for regex, base in INTEGER_TABLE:
2997 im = re.match(regex, v)
2998 if im:
2999 i = int(im.group(1), base)
3000 return '"%d":' % i if v.endswith(':') else '%d' % i
3001
3002 if v in vars:
3003 return vars[v]
3004
3005 return '"%s"' % v
3006
3007 return re.sub(r'''(?sx)
3008 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3009 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3010 {comment}|,(?={skip}[\]}}])|
3011 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3012 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3013 [0-9]+(?={skip}:)|
3014 !+
3015 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3016
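# Example (illustrative, not executed at import): unquoted keys, single quotes
# and hex/octal integers are converted to valid JSON:
#   js_to_json("{abc: 'def', num: 0x10}")  -> '{"abc": "def", "num": 16}'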
3017
3018 def qualities(quality_ids):
3019 """ Get a numeric quality value out of a list of possible values """
3020 def q(qid):
3021 try:
3022 return quality_ids.index(qid)
3023 except ValueError:
3024 return -1
3025 return q
3026
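# Example (illustrative, not executed at import):
#   q = qualities(['240p', '480p', '720p'])
#   q('720p')   -> 2
#   q('1080p')  -> -1  (unknown qualities sort lowest)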
3027
3028 DEFAULT_OUTTMPL = {
3029 'default': '%(title)s [%(id)s].%(ext)s',
3030 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3031 }
3032 OUTTMPL_TYPES = {
3033 'chapter': None,
3034 'subtitle': None,
3035 'thumbnail': None,
3036 'description': 'description',
3037 'annotation': 'annotations.xml',
3038 'infojson': 'info.json',
3039 'link': None,
3040 'pl_thumbnail': None,
3041 'pl_description': 'description',
3042 'pl_infojson': 'info.json',
3043 }
3044
3045 # As of [1], the format syntax is:
3046 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3047 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3048 STR_FORMAT_RE_TMPL = r'''(?x)
3049 (?<!%)(?P<prefix>(?:%%)*)
3050 %
3051 (?P<has_key>\((?P<key>{0})\))?
3052 (?P<format>
3053 (?P<conversion>[#0\-+ ]+)?
3054 (?P<min_width>\d+)?
3055 (?P<precision>\.\d+)?
3056 (?P<len_mod>[hlL])? # unused in python
3057 {1} # conversion type
3058 )
3059 '''
3060
3061
3062 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3063
3064
3065 def limit_length(s, length):
3066 """ Add ellipses to overly long strings """
3067 if s is None:
3068 return None
3069 ELLIPSES = '...'
3070 if len(s) > length:
3071 return s[:length - len(ELLIPSES)] + ELLIPSES
3072 return s
3073
3074
3075 def version_tuple(v):
3076 return tuple(int(e) for e in re.split(r'[-.]', v))
3077
3078
3079 def is_outdated_version(version, limit, assume_new=True):
3080 if not version:
3081 return not assume_new
3082 try:
3083 return version_tuple(version) < version_tuple(limit)
3084 except ValueError:
3085 return not assume_new
3086
3087
3088 def ytdl_is_updateable():
3089 """ Returns if yt-dlp can be updated with -U """
3090
3091 from .update import is_non_updateable
3092
3093 return not is_non_updateable()
3094
3095
3096 def args_to_str(args):
3097 # Get a short string representation for a subprocess command
3098 return ' '.join(compat_shlex_quote(a) for a in args)
3099
3100
3101 def error_to_compat_str(err):
3102 err_str = str(err)
3103 # On Python 2, the error byte string must be decoded with the proper
3104 # encoding rather than ASCII
3105 if sys.version_info[0] < 3:
3106 err_str = err_str.decode(preferredencoding())
3107 return err_str
3108
3109
3110 def mimetype2ext(mt):
3111 if mt is None:
3112 return None
3113
3114 mt, _, params = mt.partition(';')
3115 mt = mt.strip()
3116
3117 FULL_MAP = {
3118 'audio/mp4': 'm4a',
3119 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3120 # as it is the most popular one
3121 'audio/mpeg': 'mp3',
3122 'audio/x-wav': 'wav',
3123 'audio/wav': 'wav',
3124 'audio/wave': 'wav',
3125 }
3126
3127 ext = FULL_MAP.get(mt)
3128 if ext is not None:
3129 return ext
3130
3131 SUBTYPE_MAP = {
3132 '3gpp': '3gp',
3133 'smptett+xml': 'tt',
3134 'ttaf+xml': 'dfxp',
3135 'ttml+xml': 'ttml',
3136 'x-flv': 'flv',
3137 'x-mp4-fragmented': 'mp4',
3138 'x-ms-sami': 'sami',
3139 'x-ms-wmv': 'wmv',
3140 'mpegurl': 'm3u8',
3141 'x-mpegurl': 'm3u8',
3142 'vnd.apple.mpegurl': 'm3u8',
3143 'dash+xml': 'mpd',
3144 'f4m+xml': 'f4m',
3145 'hds+xml': 'f4m',
3146 'vnd.ms-sstr+xml': 'ism',
3147 'quicktime': 'mov',
3148 'mp2t': 'ts',
3149 'x-wav': 'wav',
3150 'filmstrip+json': 'fs',
3151 'svg+xml': 'svg',
3152 }
3153
3154 _, _, subtype = mt.rpartition('/')
3155 ext = SUBTYPE_MAP.get(subtype.lower())
3156 if ext is not None:
3157 return ext
3158
3159 SUFFIX_MAP = {
3160 'json': 'json',
3161 'xml': 'xml',
3162 'zip': 'zip',
3163 'gzip': 'gz',
3164 }
3165
3166 _, _, suffix = subtype.partition('+')
3167 ext = SUFFIX_MAP.get(suffix)
3168 if ext is not None:
3169 return ext
3170
3171 return subtype.replace('+', '.')
3172
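# Example (illustrative, not executed at import): the full type, the subtype
# and the '+suffix' are tried in turn:
#   mimetype2ext('audio/mpeg')            -> 'mp3'
#   mimetype2ext('application/dash+xml')  -> 'mpd'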
3173
3174 def ext2mimetype(ext_or_url):
3175 if not ext_or_url:
3176 return None
3177 if '.' not in ext_or_url:
3178 ext_or_url = f'file.{ext_or_url}'
3179 return mimetypes.guess_type(ext_or_url)[0]
3180
3181
3182 def parse_codecs(codecs_str):
3183 # http://tools.ietf.org/html/rfc6381
3184 if not codecs_str:
3185 return {}
3186 split_codecs = list(filter(None, map(
3187 str.strip, codecs_str.strip().strip(',').split(','))))
3188 vcodec, acodec, hdr = None, None, None
3189 for full_codec in split_codecs:
3190 parts = full_codec.split('.')
3191 codec = parts[0].replace('0', '')
3192 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3193 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3194 if not vcodec:
3195 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3196 if codec in ('dvh1', 'dvhe'):
3197 hdr = 'DV'
3198 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3199 hdr = 'HDR10'
3200 elif full_codec.replace('0', '').startswith('vp9.2'):
3201 hdr = 'HDR10'
3202 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3203 if not acodec:
3204 acodec = full_codec
3205 else:
3206 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3207 if vcodec or acodec:
3208 return {
3209 'vcodec': vcodec or 'none',
3210 'acodec': acodec or 'none',
3211 'dynamic_range': hdr,
3212 }
3213 elif len(split_codecs) == 2:
3214 return {
3215 'vcodec': split_codecs[0],
3216 'acodec': split_codecs[1],
3217 }
3218 return {}
3219
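# Example (illustrative, not executed at import): a typical RFC 6381 codecs
# string is split into video and audio codecs:
#   parse_codecs('avc1.42E01E, mp4a.40.2')
#   -> {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'dynamic_range': None}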
3220
3221 def urlhandle_detect_ext(url_handle):
3222 getheader = url_handle.headers.get
3223
3224 cd = getheader('Content-Disposition')
3225 if cd:
3226 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3227 if m:
3228 e = determine_ext(m.group('filename'), default_ext=None)
3229 if e:
3230 return e
3231
3232 return mimetype2ext(getheader('Content-Type'))
3233
3234
3235 def encode_data_uri(data, mime_type):
3236 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3237
3238
3239 def age_restricted(content_limit, age_limit):
3240 """ Returns True iff the content should be blocked """
3241
3242 if age_limit is None: # No limit set
3243 return False
3244 if content_limit is None:
3245 return False # Content available for everyone
3246 return age_limit < content_limit
3247
3248
3249 def is_html(first_bytes):
3250 """ Detect whether a file contains HTML by examining its first bytes. """
3251
3252 BOMS = [
3253 (b'\xef\xbb\xbf', 'utf-8'),
3254 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3255 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3256 (b'\xff\xfe', 'utf-16-le'),
3257 (b'\xfe\xff', 'utf-16-be'),
3258 ]
3259 for bom, enc in BOMS:
3260 if first_bytes.startswith(bom):
3261 s = first_bytes[len(bom):].decode(enc, 'replace')
3262 break
3263 else:
3264 s = first_bytes.decode('utf-8', 'replace')
3265
3266 return re.match(r'^\s*<', s)
3267
3268
3269 def determine_protocol(info_dict):
3270 protocol = info_dict.get('protocol')
3271 if protocol is not None:
3272 return protocol
3273
3274 url = sanitize_url(info_dict['url'])
3275 if url.startswith('rtmp'):
3276 return 'rtmp'
3277 elif url.startswith('mms'):
3278 return 'mms'
3279 elif url.startswith('rtsp'):
3280 return 'rtsp'
3281
3282 ext = determine_ext(url)
3283 if ext == 'm3u8':
3284 return 'm3u8'
3285 elif ext == 'f4m':
3286 return 'f4m'
3287
3288 return compat_urllib_parse_urlparse(url).scheme
3289
3290
3291 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3292 """ Render a list of rows, each as a list of values.
3293 Text after a \t will be right aligned """
3294 def width(string):
3295 return len(remove_terminal_sequences(string).replace('\t', ''))
3296
3297 def get_max_lens(table):
3298 return [max(width(str(v)) for v in col) for col in zip(*table)]
3299
3300 def filter_using_list(row, filterArray):
3301 return [col for (take, col) in zip(filterArray, row) if take]
3302
3303 if hide_empty:
3304 max_lens = get_max_lens(data)
3305 header_row = filter_using_list(header_row, max_lens)
3306 data = [filter_using_list(row, max_lens) for row in data]
3307
3308 table = [header_row] + data
3309 max_lens = get_max_lens(table)
3310 extra_gap += 1
3311 if delim:
3312 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3313 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
3314 for row in table:
3315 for pos, text in enumerate(map(str, row)):
3316 if '\t' in text:
3317 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3318 else:
3319 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3320 ret = '\n'.join(''.join(row).rstrip() for row in table)
3321 return ret
3322
3323
3324 def _match_one(filter_part, dct, incomplete):
3325 # TODO: Generalize code with YoutubeDL._build_format_filter
3326 STRING_OPERATORS = {
3327 '*=': operator.contains,
3328 '^=': lambda attr, value: attr.startswith(value),
3329 '$=': lambda attr, value: attr.endswith(value),
3330 '~=': lambda attr, value: re.search(value, attr),
3331 }
3332 COMPARISON_OPERATORS = {
3333 **STRING_OPERATORS,
3334 '<=': operator.le, # "<=" must be defined above "<"
3335 '<': operator.lt,
3336 '>=': operator.ge,
3337 '>': operator.gt,
3338 '=': operator.eq,
3339 }
3340
3341 operator_rex = re.compile(r'''(?x)\s*
3342 (?P<key>[a-z_]+)
3343 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3344 (?:
3345 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3346 (?P<strval>.+?)
3347 )
3348 \s*$
3349 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3350 m = operator_rex.search(filter_part)
3351 if m:
3352 m = m.groupdict()
3353 unnegated_op = COMPARISON_OPERATORS[m['op']]
3354 if m['negation']:
3355 op = lambda attr, value: not unnegated_op(attr, value)
3356 else:
3357 op = unnegated_op
3358 comparison_value = m['quotedstrval'] or m['strval']
3359 if m['quote']:
3360 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3361 actual_value = dct.get(m['key'])
3362 numeric_comparison = None
3363 if isinstance(actual_value, compat_numeric_types):
3364 # If the original field is a string and the matching comparison value is
3365 # a number we should respect the origin of the original field
3366 # and process comparison value as a string (see
3367 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3368 try:
3369 numeric_comparison = int(comparison_value)
3370 except ValueError:
3371 numeric_comparison = parse_filesize(comparison_value)
3372 if numeric_comparison is None:
3373 numeric_comparison = parse_filesize(f'{comparison_value}B')
3374 if numeric_comparison is None:
3375 numeric_comparison = parse_duration(comparison_value)
3376 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3377 raise ValueError('Operator %s only supports string values!' % m['op'])
3378 if actual_value is None:
3379 return incomplete or m['none_inclusive']
3380 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3381
3382 UNARY_OPERATORS = {
3383 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3384 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3385 }
3386 operator_rex = re.compile(r'''(?x)\s*
3387 (?P<op>%s)\s*(?P<key>[a-z_]+)
3388 \s*$
3389 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3390 m = operator_rex.search(filter_part)
3391 if m:
3392 op = UNARY_OPERATORS[m.group('op')]
3393 actual_value = dct.get(m.group('key'))
3394 if incomplete and actual_value is None:
3395 return True
3396 return op(actual_value)
3397
3398 raise ValueError('Invalid filter part %r' % filter_part)
3399
3400
3401 def match_str(filter_str, dct, incomplete=False):
3402 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3403 When incomplete, all conditions passes on missing fields
3404 """
3405 return all(
3406 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3407 for filter_part in re.split(r'(?<!\\)&', filter_str))
3408
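# Example (illustrative, not executed at import): '&' separates conditions,
# which may be comparisons or bare/negated key tests:
#   match_str('duration > 30 & description', {'duration': 60, 'description': 'x'})  -> True
#   match_str('!is_live', {'is_live': True})                                        -> False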
3409
3410 def match_filter_func(filter_str):
3411 def _match_func(info_dict, *args, **kwargs):
3412 if match_str(filter_str, info_dict, *args, **kwargs):
3413 return None
3414 else:
3415 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3416 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3417 return _match_func
3418
3419
3420 def parse_dfxp_time_expr(time_expr):
3421 if not time_expr:
3422 return
3423
3424 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3425 if mobj:
3426 return float(mobj.group('time_offset'))
3427
3428 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3429 if mobj:
3430 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3431
3432
3433 def srt_subtitles_timecode(seconds):
3434 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3435
3436
3437 def ass_subtitles_timecode(seconds):
3438 time = timetuple_from_msec(seconds * 1000)
3439 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3440
3441
3442 def dfxp2srt(dfxp_data):
3443 '''
3444 @param dfxp_data A bytes-like object containing DFXP data
3445 @returns A unicode object containing converted SRT data
3446 '''
3447 LEGACY_NAMESPACES = (
3448 (b'http://www.w3.org/ns/ttml', [
3449 b'http://www.w3.org/2004/11/ttaf1',
3450 b'http://www.w3.org/2006/04/ttaf1',
3451 b'http://www.w3.org/2006/10/ttaf1',
3452 ]),
3453 (b'http://www.w3.org/ns/ttml#styling', [
3454 b'http://www.w3.org/ns/ttml#style',
3455 ]),
3456 )
3457
3458 SUPPORTED_STYLING = [
3459 'color',
3460 'fontFamily',
3461 'fontSize',
3462 'fontStyle',
3463 'fontWeight',
3464 'textDecoration'
3465 ]
3466
3467 _x = functools.partial(xpath_with_ns, ns_map={
3468 'xml': 'http://www.w3.org/XML/1998/namespace',
3469 'ttml': 'http://www.w3.org/ns/ttml',
3470 'tts': 'http://www.w3.org/ns/ttml#styling',
3471 })
3472
3473 styles = {}
3474 default_style = {}
3475
3476 class TTMLPElementParser(object):
3477 _out = ''
3478 _unclosed_elements = []
3479 _applied_styles = []
3480
3481 def start(self, tag, attrib):
3482 if tag in (_x('ttml:br'), 'br'):
3483 self._out += '\n'
3484 else:
3485 unclosed_elements = []
3486 style = {}
3487 element_style_id = attrib.get('style')
3488 if default_style:
3489 style.update(default_style)
3490 if element_style_id:
3491 style.update(styles.get(element_style_id, {}))
3492 for prop in SUPPORTED_STYLING:
3493 prop_val = attrib.get(_x('tts:' + prop))
3494 if prop_val:
3495 style[prop] = prop_val
3496 if style:
3497 font = ''
3498 for k, v in sorted(style.items()):
3499 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3500 continue
3501 if k == 'color':
3502 font += ' color="%s"' % v
3503 elif k == 'fontSize':
3504 font += ' size="%s"' % v
3505 elif k == 'fontFamily':
3506 font += ' face="%s"' % v
3507 elif k == 'fontWeight' and v == 'bold':
3508 self._out += '<b>'
3509 unclosed_elements.append('b')
3510 elif k == 'fontStyle' and v == 'italic':
3511 self._out += '<i>'
3512 unclosed_elements.append('i')
3513 elif k == 'textDecoration' and v == 'underline':
3514 self._out += '<u>'
3515 unclosed_elements.append('u')
3516 if font:
3517 self._out += '<font' + font + '>'
3518 unclosed_elements.append('font')
3519 applied_style = {}
3520 if self._applied_styles:
3521 applied_style.update(self._applied_styles[-1])
3522 applied_style.update(style)
3523 self._applied_styles.append(applied_style)
3524 self._unclosed_elements.append(unclosed_elements)
3525
3526 def end(self, tag):
3527 if tag not in (_x('ttml:br'), 'br'):
3528 unclosed_elements = self._unclosed_elements.pop()
3529 for element in reversed(unclosed_elements):
3530 self._out += '</%s>' % element
3531 if unclosed_elements and self._applied_styles:
3532 self._applied_styles.pop()
3533
3534 def data(self, data):
3535 self._out += data
3536
3537 def close(self):
3538 return self._out.strip()
3539
3540 def parse_node(node):
3541 target = TTMLPElementParser()
3542 parser = xml.etree.ElementTree.XMLParser(target=target)
3543 parser.feed(xml.etree.ElementTree.tostring(node))
3544 return parser.close()
3545
3546 for k, v in LEGACY_NAMESPACES:
3547 for ns in v:
3548 dfxp_data = dfxp_data.replace(ns, k)
3549
3550 dfxp = compat_etree_fromstring(dfxp_data)
3551 out = []
3552 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3553
3554 if not paras:
3555 raise ValueError('Invalid dfxp/TTML subtitle')
3556
3557 repeat = False
3558 while True:
3559 for style in dfxp.findall(_x('.//ttml:style')):
3560 style_id = style.get('id') or style.get(_x('xml:id'))
3561 if not style_id:
3562 continue
3563 parent_style_id = style.get('style')
3564 if parent_style_id:
3565 if parent_style_id not in styles:
3566 repeat = True
3567 continue
3568 styles[style_id] = styles[parent_style_id].copy()
3569 for prop in SUPPORTED_STYLING:
3570 prop_val = style.get(_x('tts:' + prop))
3571 if prop_val:
3572 styles.setdefault(style_id, {})[prop] = prop_val
3573 if repeat:
3574 repeat = False
3575 else:
3576 break
3577
3578 for p in ('body', 'div'):
3579 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3580 if ele is None:
3581 continue
3582 style = styles.get(ele.get('style'))
3583 if not style:
3584 continue
3585 default_style.update(style)
3586
3587 for para, index in zip(paras, itertools.count(1)):
3588 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3589 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3590 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3591 if begin_time is None:
3592 continue
3593 if not end_time:
3594 if not dur:
3595 continue
3596 end_time = begin_time + dur
3597 out.append('%d\n%s --> %s\n%s\n\n' % (
3598 index,
3599 srt_subtitles_timecode(begin_time),
3600 srt_subtitles_timecode(end_time),
3601 parse_node(para)))
3602
3603 return ''.join(out)
3604
3605
3606 def cli_option(params, command_option, param):
3607 param = params.get(param)
3608 if param:
3609 param = compat_str(param)
3610 return [command_option, param] if param is not None else []
3611
3612
3613 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3614 param = params.get(param)
3615 if param is None:
3616 return []
3617 assert isinstance(param, bool)
3618 if separator:
3619 return [command_option + separator + (true_value if param else false_value)]
3620 return [command_option, true_value if param else false_value]
3621
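# Example (illustrative, not executed at import; option names are hypothetical):
#   cli_bool_option({'check': True}, '--check', 'check')                  -> ['--check', 'true']
#   cli_bool_option({'check': False}, '--check', 'check', separator='=')  -> ['--check=false']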
3622
3623 def cli_valueless_option(params, command_option, param, expected_value=True):
3624 param = params.get(param)
3625 return [command_option] if param == expected_value else []
3626
3627
3628 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3629 if isinstance(argdict, (list, tuple)): # for backward compatibility
3630 if use_compat:
3631 return argdict
3632 else:
3633 argdict = None
3634 if argdict is None:
3635 return default
3636 assert isinstance(argdict, dict)
3637
3638 assert isinstance(keys, (list, tuple))
3639 for key_list in keys:
3640 arg_list = list(filter(
3641 lambda x: x is not None,
3642 [argdict.get(key.lower()) for key in variadic(key_list)]))
3643 if arg_list:
3644 return [arg for args in arg_list for arg in args]
3645 return default
3646
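# Usage sketch (a hypothetical argdict; keys are tried in order and the first
# match wins):
#   >>> cli_configuration_args({'ffmpeg': ['-loglevel', 'quiet'], 'default': []}, ['ffmpeg', 'default'])
#   ['-loglevel', 'quiet']
#   >>> cli_configuration_args({'default': ['-v']}, [('sponskrub', 'default')])
#   ['-v']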
3647
3648 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3649 main_key, exe = main_key.lower(), exe.lower()
3650 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3651 keys = [f'{root_key}{k}' for k in (keys or [''])]
3652 if root_key in keys:
3653 if main_key != exe:
3654 keys.append((main_key, exe))
3655 keys.append('default')
3656 else:
3657 use_compat = False
3658 return cli_configuration_args(argdict, keys, default, use_compat)
3659
3660
3661 class ISO639Utils(object):
3662 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3663 _lang_map = {
3664 'aa': 'aar',
3665 'ab': 'abk',
3666 'ae': 'ave',
3667 'af': 'afr',
3668 'ak': 'aka',
3669 'am': 'amh',
3670 'an': 'arg',
3671 'ar': 'ara',
3672 'as': 'asm',
3673 'av': 'ava',
3674 'ay': 'aym',
3675 'az': 'aze',
3676 'ba': 'bak',
3677 'be': 'bel',
3678 'bg': 'bul',
3679 'bh': 'bih',
3680 'bi': 'bis',
3681 'bm': 'bam',
3682 'bn': 'ben',
3683 'bo': 'bod',
3684 'br': 'bre',
3685 'bs': 'bos',
3686 'ca': 'cat',
3687 'ce': 'che',
3688 'ch': 'cha',
3689 'co': 'cos',
3690 'cr': 'cre',
3691 'cs': 'ces',
3692 'cu': 'chu',
3693 'cv': 'chv',
3694 'cy': 'cym',
3695 'da': 'dan',
3696 'de': 'deu',
3697 'dv': 'div',
3698 'dz': 'dzo',
3699 'ee': 'ewe',
3700 'el': 'ell',
3701 'en': 'eng',
3702 'eo': 'epo',
3703 'es': 'spa',
3704 'et': 'est',
3705 'eu': 'eus',
3706 'fa': 'fas',
3707 'ff': 'ful',
3708 'fi': 'fin',
3709 'fj': 'fij',
3710 'fo': 'fao',
3711 'fr': 'fra',
3712 'fy': 'fry',
3713 'ga': 'gle',
3714 'gd': 'gla',
3715 'gl': 'glg',
3716 'gn': 'grn',
3717 'gu': 'guj',
3718 'gv': 'glv',
3719 'ha': 'hau',
3720 'he': 'heb',
3721 'iw': 'heb', # Replaced by he in 1989 revision
3722 'hi': 'hin',
3723 'ho': 'hmo',
3724 'hr': 'hrv',
3725 'ht': 'hat',
3726 'hu': 'hun',
3727 'hy': 'hye',
3728 'hz': 'her',
3729 'ia': 'ina',
3730 'id': 'ind',
3731 'in': 'ind', # Replaced by id in 1989 revision
3732 'ie': 'ile',
3733 'ig': 'ibo',
3734 'ii': 'iii',
3735 'ik': 'ipk',
3736 'io': 'ido',
3737 'is': 'isl',
3738 'it': 'ita',
3739 'iu': 'iku',
3740 'ja': 'jpn',
3741 'jv': 'jav',
3742 'ka': 'kat',
3743 'kg': 'kon',
3744 'ki': 'kik',
3745 'kj': 'kua',
3746 'kk': 'kaz',
3747 'kl': 'kal',
3748 'km': 'khm',
3749 'kn': 'kan',
3750 'ko': 'kor',
3751 'kr': 'kau',
3752 'ks': 'kas',
3753 'ku': 'kur',
3754 'kv': 'kom',
3755 'kw': 'cor',
3756 'ky': 'kir',
3757 'la': 'lat',
3758 'lb': 'ltz',
3759 'lg': 'lug',
3760 'li': 'lim',
3761 'ln': 'lin',
3762 'lo': 'lao',
3763 'lt': 'lit',
3764 'lu': 'lub',
3765 'lv': 'lav',
3766 'mg': 'mlg',
3767 'mh': 'mah',
3768 'mi': 'mri',
3769 'mk': 'mkd',
3770 'ml': 'mal',
3771 'mn': 'mon',
3772 'mr': 'mar',
3773 'ms': 'msa',
3774 'mt': 'mlt',
3775 'my': 'mya',
3776 'na': 'nau',
3777 'nb': 'nob',
3778 'nd': 'nde',
3779 'ne': 'nep',
3780 'ng': 'ndo',
3781 'nl': 'nld',
3782 'nn': 'nno',
3783 'no': 'nor',
3784 'nr': 'nbl',
3785 'nv': 'nav',
3786 'ny': 'nya',
3787 'oc': 'oci',
3788 'oj': 'oji',
3789 'om': 'orm',
3790 'or': 'ori',
3791 'os': 'oss',
3792 'pa': 'pan',
3793 'pi': 'pli',
3794 'pl': 'pol',
3795 'ps': 'pus',
3796 'pt': 'por',
3797 'qu': 'que',
3798 'rm': 'roh',
3799 'rn': 'run',
3800 'ro': 'ron',
3801 'ru': 'rus',
3802 'rw': 'kin',
3803 'sa': 'san',
3804 'sc': 'srd',
3805 'sd': 'snd',
3806 'se': 'sme',
3807 'sg': 'sag',
3808 'si': 'sin',
3809 'sk': 'slk',
3810 'sl': 'slv',
3811 'sm': 'smo',
3812 'sn': 'sna',
3813 'so': 'som',
3814 'sq': 'sqi',
3815 'sr': 'srp',
3816 'ss': 'ssw',
3817 'st': 'sot',
3818 'su': 'sun',
3819 'sv': 'swe',
3820 'sw': 'swa',
3821 'ta': 'tam',
3822 'te': 'tel',
3823 'tg': 'tgk',
3824 'th': 'tha',
3825 'ti': 'tir',
3826 'tk': 'tuk',
3827 'tl': 'tgl',
3828 'tn': 'tsn',
3829 'to': 'ton',
3830 'tr': 'tur',
3831 'ts': 'tso',
3832 'tt': 'tat',
3833 'tw': 'twi',
3834 'ty': 'tah',
3835 'ug': 'uig',
3836 'uk': 'ukr',
3837 'ur': 'urd',
3838 'uz': 'uzb',
3839 've': 'ven',
3840 'vi': 'vie',
3841 'vo': 'vol',
3842 'wa': 'wln',
3843 'wo': 'wol',
3844 'xh': 'xho',
3845 'yi': 'yid',
3846 'ji': 'yid', # Replaced by yi in 1989 revision
3847 'yo': 'yor',
3848 'za': 'zha',
3849 'zh': 'zho',
3850 'zu': 'zul',
3851 }
3852
3853 @classmethod
3854 def short2long(cls, code):
3855 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3856 return cls._lang_map.get(code[:2])
3857
3858 @classmethod
3859 def long2short(cls, code):
3860 """Convert language code from ISO 639-2/T to ISO 639-1"""
3861 for short_name, long_name in cls._lang_map.items():
3862 if long_name == code:
3863 return short_name
3864
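# Usage sketch:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'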
3865
3866 class ISO3166Utils(object):
3867 # From http://data.okfn.org/data/core/country-list
3868 _country_map = {
3869 'AF': 'Afghanistan',
3870 'AX': 'Åland Islands',
3871 'AL': 'Albania',
3872 'DZ': 'Algeria',
3873 'AS': 'American Samoa',
3874 'AD': 'Andorra',
3875 'AO': 'Angola',
3876 'AI': 'Anguilla',
3877 'AQ': 'Antarctica',
3878 'AG': 'Antigua and Barbuda',
3879 'AR': 'Argentina',
3880 'AM': 'Armenia',
3881 'AW': 'Aruba',
3882 'AU': 'Australia',
3883 'AT': 'Austria',
3884 'AZ': 'Azerbaijan',
3885 'BS': 'Bahamas',
3886 'BH': 'Bahrain',
3887 'BD': 'Bangladesh',
3888 'BB': 'Barbados',
3889 'BY': 'Belarus',
3890 'BE': 'Belgium',
3891 'BZ': 'Belize',
3892 'BJ': 'Benin',
3893 'BM': 'Bermuda',
3894 'BT': 'Bhutan',
3895 'BO': 'Bolivia, Plurinational State of',
3896 'BQ': 'Bonaire, Sint Eustatius and Saba',
3897 'BA': 'Bosnia and Herzegovina',
3898 'BW': 'Botswana',
3899 'BV': 'Bouvet Island',
3900 'BR': 'Brazil',
3901 'IO': 'British Indian Ocean Territory',
3902 'BN': 'Brunei Darussalam',
3903 'BG': 'Bulgaria',
3904 'BF': 'Burkina Faso',
3905 'BI': 'Burundi',
3906 'KH': 'Cambodia',
3907 'CM': 'Cameroon',
3908 'CA': 'Canada',
3909 'CV': 'Cape Verde',
3910 'KY': 'Cayman Islands',
3911 'CF': 'Central African Republic',
3912 'TD': 'Chad',
3913 'CL': 'Chile',
3914 'CN': 'China',
3915 'CX': 'Christmas Island',
3916 'CC': 'Cocos (Keeling) Islands',
3917 'CO': 'Colombia',
3918 'KM': 'Comoros',
3919 'CG': 'Congo',
3920 'CD': 'Congo, the Democratic Republic of the',
3921 'CK': 'Cook Islands',
3922 'CR': 'Costa Rica',
3923 'CI': 'Côte d\'Ivoire',
3924 'HR': 'Croatia',
3925 'CU': 'Cuba',
3926 'CW': 'Curaçao',
3927 'CY': 'Cyprus',
3928 'CZ': 'Czech Republic',
3929 'DK': 'Denmark',
3930 'DJ': 'Djibouti',
3931 'DM': 'Dominica',
3932 'DO': 'Dominican Republic',
3933 'EC': 'Ecuador',
3934 'EG': 'Egypt',
3935 'SV': 'El Salvador',
3936 'GQ': 'Equatorial Guinea',
3937 'ER': 'Eritrea',
3938 'EE': 'Estonia',
3939 'ET': 'Ethiopia',
3940 'FK': 'Falkland Islands (Malvinas)',
3941 'FO': 'Faroe Islands',
3942 'FJ': 'Fiji',
3943 'FI': 'Finland',
3944 'FR': 'France',
3945 'GF': 'French Guiana',
3946 'PF': 'French Polynesia',
3947 'TF': 'French Southern Territories',
3948 'GA': 'Gabon',
3949 'GM': 'Gambia',
3950 'GE': 'Georgia',
3951 'DE': 'Germany',
3952 'GH': 'Ghana',
3953 'GI': 'Gibraltar',
3954 'GR': 'Greece',
3955 'GL': 'Greenland',
3956 'GD': 'Grenada',
3957 'GP': 'Guadeloupe',
3958 'GU': 'Guam',
3959 'GT': 'Guatemala',
3960 'GG': 'Guernsey',
3961 'GN': 'Guinea',
3962 'GW': 'Guinea-Bissau',
3963 'GY': 'Guyana',
3964 'HT': 'Haiti',
3965 'HM': 'Heard Island and McDonald Islands',
3966 'VA': 'Holy See (Vatican City State)',
3967 'HN': 'Honduras',
3968 'HK': 'Hong Kong',
3969 'HU': 'Hungary',
3970 'IS': 'Iceland',
3971 'IN': 'India',
3972 'ID': 'Indonesia',
3973 'IR': 'Iran, Islamic Republic of',
3974 'IQ': 'Iraq',
3975 'IE': 'Ireland',
3976 'IM': 'Isle of Man',
3977 'IL': 'Israel',
3978 'IT': 'Italy',
3979 'JM': 'Jamaica',
3980 'JP': 'Japan',
3981 'JE': 'Jersey',
3982 'JO': 'Jordan',
3983 'KZ': 'Kazakhstan',
3984 'KE': 'Kenya',
3985 'KI': 'Kiribati',
3986 'KP': 'Korea, Democratic People\'s Republic of',
3987 'KR': 'Korea, Republic of',
3988 'KW': 'Kuwait',
3989 'KG': 'Kyrgyzstan',
3990 'LA': 'Lao People\'s Democratic Republic',
3991 'LV': 'Latvia',
3992 'LB': 'Lebanon',
3993 'LS': 'Lesotho',
3994 'LR': 'Liberia',
3995 'LY': 'Libya',
3996 'LI': 'Liechtenstein',
3997 'LT': 'Lithuania',
3998 'LU': 'Luxembourg',
3999 'MO': 'Macao',
4000 'MK': 'Macedonia, the Former Yugoslav Republic of',
4001 'MG': 'Madagascar',
4002 'MW': 'Malawi',
4003 'MY': 'Malaysia',
4004 'MV': 'Maldives',
4005 'ML': 'Mali',
4006 'MT': 'Malta',
4007 'MH': 'Marshall Islands',
4008 'MQ': 'Martinique',
4009 'MR': 'Mauritania',
4010 'MU': 'Mauritius',
4011 'YT': 'Mayotte',
4012 'MX': 'Mexico',
4013 'FM': 'Micronesia, Federated States of',
4014 'MD': 'Moldova, Republic of',
4015 'MC': 'Monaco',
4016 'MN': 'Mongolia',
4017 'ME': 'Montenegro',
4018 'MS': 'Montserrat',
4019 'MA': 'Morocco',
4020 'MZ': 'Mozambique',
4021 'MM': 'Myanmar',
4022 'NA': 'Namibia',
4023 'NR': 'Nauru',
4024 'NP': 'Nepal',
4025 'NL': 'Netherlands',
4026 'NC': 'New Caledonia',
4027 'NZ': 'New Zealand',
4028 'NI': 'Nicaragua',
4029 'NE': 'Niger',
4030 'NG': 'Nigeria',
4031 'NU': 'Niue',
4032 'NF': 'Norfolk Island',
4033 'MP': 'Northern Mariana Islands',
4034 'NO': 'Norway',
4035 'OM': 'Oman',
4036 'PK': 'Pakistan',
4037 'PW': 'Palau',
4038 'PS': 'Palestine, State of',
4039 'PA': 'Panama',
4040 'PG': 'Papua New Guinea',
4041 'PY': 'Paraguay',
4042 'PE': 'Peru',
4043 'PH': 'Philippines',
4044 'PN': 'Pitcairn',
4045 'PL': 'Poland',
4046 'PT': 'Portugal',
4047 'PR': 'Puerto Rico',
4048 'QA': 'Qatar',
4049 'RE': 'Réunion',
4050 'RO': 'Romania',
4051 'RU': 'Russian Federation',
4052 'RW': 'Rwanda',
4053 'BL': 'Saint Barthélemy',
4054 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4055 'KN': 'Saint Kitts and Nevis',
4056 'LC': 'Saint Lucia',
4057 'MF': 'Saint Martin (French part)',
4058 'PM': 'Saint Pierre and Miquelon',
4059 'VC': 'Saint Vincent and the Grenadines',
4060 'WS': 'Samoa',
4061 'SM': 'San Marino',
4062 'ST': 'Sao Tome and Principe',
4063 'SA': 'Saudi Arabia',
4064 'SN': 'Senegal',
4065 'RS': 'Serbia',
4066 'SC': 'Seychelles',
4067 'SL': 'Sierra Leone',
4068 'SG': 'Singapore',
4069 'SX': 'Sint Maarten (Dutch part)',
4070 'SK': 'Slovakia',
4071 'SI': 'Slovenia',
4072 'SB': 'Solomon Islands',
4073 'SO': 'Somalia',
4074 'ZA': 'South Africa',
4075 'GS': 'South Georgia and the South Sandwich Islands',
4076 'SS': 'South Sudan',
4077 'ES': 'Spain',
4078 'LK': 'Sri Lanka',
4079 'SD': 'Sudan',
4080 'SR': 'Suriname',
4081 'SJ': 'Svalbard and Jan Mayen',
4082 'SZ': 'Swaziland',
4083 'SE': 'Sweden',
4084 'CH': 'Switzerland',
4085 'SY': 'Syrian Arab Republic',
4086 'TW': 'Taiwan, Province of China',
4087 'TJ': 'Tajikistan',
4088 'TZ': 'Tanzania, United Republic of',
4089 'TH': 'Thailand',
4090 'TL': 'Timor-Leste',
4091 'TG': 'Togo',
4092 'TK': 'Tokelau',
4093 'TO': 'Tonga',
4094 'TT': 'Trinidad and Tobago',
4095 'TN': 'Tunisia',
4096 'TR': 'Turkey',
4097 'TM': 'Turkmenistan',
4098 'TC': 'Turks and Caicos Islands',
4099 'TV': 'Tuvalu',
4100 'UG': 'Uganda',
4101 'UA': 'Ukraine',
4102 'AE': 'United Arab Emirates',
4103 'GB': 'United Kingdom',
4104 'US': 'United States',
4105 'UM': 'United States Minor Outlying Islands',
4106 'UY': 'Uruguay',
4107 'UZ': 'Uzbekistan',
4108 'VU': 'Vanuatu',
4109 'VE': 'Venezuela, Bolivarian Republic of',
4110 'VN': 'Viet Nam',
4111 'VG': 'Virgin Islands, British',
4112 'VI': 'Virgin Islands, U.S.',
4113 'WF': 'Wallis and Futuna',
4114 'EH': 'Western Sahara',
4115 'YE': 'Yemen',
4116 'ZM': 'Zambia',
4117 'ZW': 'Zimbabwe',
4118 }
4119
4120 @classmethod
4121 def short2full(cls, code):
4122 """Convert an ISO 3166-2 country code to the corresponding full name"""
4123 return cls._country_map.get(code.upper())
4124
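# Usage sketch (lookup is case-insensitive):
#   >>> ISO3166Utils.short2full('de')
#   'Germany'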
4125
4126 class GeoUtils(object):
4127 # Major IPv4 address blocks per country
4128 _country_ip_map = {
4129 'AD': '46.172.224.0/19',
4130 'AE': '94.200.0.0/13',
4131 'AF': '149.54.0.0/17',
4132 'AG': '209.59.64.0/18',
4133 'AI': '204.14.248.0/21',
4134 'AL': '46.99.0.0/16',
4135 'AM': '46.70.0.0/15',
4136 'AO': '105.168.0.0/13',
4137 'AP': '182.50.184.0/21',
4138 'AQ': '23.154.160.0/24',
4139 'AR': '181.0.0.0/12',
4140 'AS': '202.70.112.0/20',
4141 'AT': '77.116.0.0/14',
4142 'AU': '1.128.0.0/11',
4143 'AW': '181.41.0.0/18',
4144 'AX': '185.217.4.0/22',
4145 'AZ': '5.197.0.0/16',
4146 'BA': '31.176.128.0/17',
4147 'BB': '65.48.128.0/17',
4148 'BD': '114.130.0.0/16',
4149 'BE': '57.0.0.0/8',
4150 'BF': '102.178.0.0/15',
4151 'BG': '95.42.0.0/15',
4152 'BH': '37.131.0.0/17',
4153 'BI': '154.117.192.0/18',
4154 'BJ': '137.255.0.0/16',
4155 'BL': '185.212.72.0/23',
4156 'BM': '196.12.64.0/18',
4157 'BN': '156.31.0.0/16',
4158 'BO': '161.56.0.0/16',
4159 'BQ': '161.0.80.0/20',
4160 'BR': '191.128.0.0/12',
4161 'BS': '24.51.64.0/18',
4162 'BT': '119.2.96.0/19',
4163 'BW': '168.167.0.0/16',
4164 'BY': '178.120.0.0/13',
4165 'BZ': '179.42.192.0/18',
4166 'CA': '99.224.0.0/11',
4167 'CD': '41.243.0.0/16',
4168 'CF': '197.242.176.0/21',
4169 'CG': '160.113.0.0/16',
4170 'CH': '85.0.0.0/13',
4171 'CI': '102.136.0.0/14',
4172 'CK': '202.65.32.0/19',
4173 'CL': '152.172.0.0/14',
4174 'CM': '102.244.0.0/14',
4175 'CN': '36.128.0.0/10',
4176 'CO': '181.240.0.0/12',
4177 'CR': '201.192.0.0/12',
4178 'CU': '152.206.0.0/15',
4179 'CV': '165.90.96.0/19',
4180 'CW': '190.88.128.0/17',
4181 'CY': '31.153.0.0/16',
4182 'CZ': '88.100.0.0/14',
4183 'DE': '53.0.0.0/8',
4184 'DJ': '197.241.0.0/17',
4185 'DK': '87.48.0.0/12',
4186 'DM': '192.243.48.0/20',
4187 'DO': '152.166.0.0/15',
4188 'DZ': '41.96.0.0/12',
4189 'EC': '186.68.0.0/15',
4190 'EE': '90.190.0.0/15',
4191 'EG': '156.160.0.0/11',
4192 'ER': '196.200.96.0/20',
4193 'ES': '88.0.0.0/11',
4194 'ET': '196.188.0.0/14',
4195 'EU': '2.16.0.0/13',
4196 'FI': '91.152.0.0/13',
4197 'FJ': '144.120.0.0/16',
4198 'FK': '80.73.208.0/21',
4199 'FM': '119.252.112.0/20',
4200 'FO': '88.85.32.0/19',
4201 'FR': '90.0.0.0/9',
4202 'GA': '41.158.0.0/15',
4203 'GB': '25.0.0.0/8',
4204 'GD': '74.122.88.0/21',
4205 'GE': '31.146.0.0/16',
4206 'GF': '161.22.64.0/18',
4207 'GG': '62.68.160.0/19',
4208 'GH': '154.160.0.0/12',
4209 'GI': '95.164.0.0/16',
4210 'GL': '88.83.0.0/19',
4211 'GM': '160.182.0.0/15',
4212 'GN': '197.149.192.0/18',
4213 'GP': '104.250.0.0/19',
4214 'GQ': '105.235.224.0/20',
4215 'GR': '94.64.0.0/13',
4216 'GT': '168.234.0.0/16',
4217 'GU': '168.123.0.0/16',
4218 'GW': '197.214.80.0/20',
4219 'GY': '181.41.64.0/18',
4220 'HK': '113.252.0.0/14',
4221 'HN': '181.210.0.0/16',
4222 'HR': '93.136.0.0/13',
4223 'HT': '148.102.128.0/17',
4224 'HU': '84.0.0.0/14',
4225 'ID': '39.192.0.0/10',
4226 'IE': '87.32.0.0/12',
4227 'IL': '79.176.0.0/13',
4228 'IM': '5.62.80.0/20',
4229 'IN': '117.192.0.0/10',
4230 'IO': '203.83.48.0/21',
4231 'IQ': '37.236.0.0/14',
4232 'IR': '2.176.0.0/12',
4233 'IS': '82.221.0.0/16',
4234 'IT': '79.0.0.0/10',
4235 'JE': '87.244.64.0/18',
4236 'JM': '72.27.0.0/17',
4237 'JO': '176.29.0.0/16',
4238 'JP': '133.0.0.0/8',
4239 'KE': '105.48.0.0/12',
4240 'KG': '158.181.128.0/17',
4241 'KH': '36.37.128.0/17',
4242 'KI': '103.25.140.0/22',
4243 'KM': '197.255.224.0/20',
4244 'KN': '198.167.192.0/19',
4245 'KP': '175.45.176.0/22',
4246 'KR': '175.192.0.0/10',
4247 'KW': '37.36.0.0/14',
4248 'KY': '64.96.0.0/15',
4249 'KZ': '2.72.0.0/13',
4250 'LA': '115.84.64.0/18',
4251 'LB': '178.135.0.0/16',
4252 'LC': '24.92.144.0/20',
4253 'LI': '82.117.0.0/19',
4254 'LK': '112.134.0.0/15',
4255 'LR': '102.183.0.0/16',
4256 'LS': '129.232.0.0/17',
4257 'LT': '78.56.0.0/13',
4258 'LU': '188.42.0.0/16',
4259 'LV': '46.109.0.0/16',
4260 'LY': '41.252.0.0/14',
4261 'MA': '105.128.0.0/11',
4262 'MC': '88.209.64.0/18',
4263 'MD': '37.246.0.0/16',
4264 'ME': '178.175.0.0/17',
4265 'MF': '74.112.232.0/21',
4266 'MG': '154.126.0.0/17',
4267 'MH': '117.103.88.0/21',
4268 'MK': '77.28.0.0/15',
4269 'ML': '154.118.128.0/18',
4270 'MM': '37.111.0.0/17',
4271 'MN': '49.0.128.0/17',
4272 'MO': '60.246.0.0/16',
4273 'MP': '202.88.64.0/20',
4274 'MQ': '109.203.224.0/19',
4275 'MR': '41.188.64.0/18',
4276 'MS': '208.90.112.0/22',
4277 'MT': '46.11.0.0/16',
4278 'MU': '105.16.0.0/12',
4279 'MV': '27.114.128.0/18',
4280 'MW': '102.70.0.0/15',
4281 'MX': '187.192.0.0/11',
4282 'MY': '175.136.0.0/13',
4283 'MZ': '197.218.0.0/15',
4284 'NA': '41.182.0.0/16',
4285 'NC': '101.101.0.0/18',
4286 'NE': '197.214.0.0/18',
4287 'NF': '203.17.240.0/22',
4288 'NG': '105.112.0.0/12',
4289 'NI': '186.76.0.0/15',
4290 'NL': '145.96.0.0/11',
4291 'NO': '84.208.0.0/13',
4292 'NP': '36.252.0.0/15',
4293 'NR': '203.98.224.0/19',
4294 'NU': '49.156.48.0/22',
4295 'NZ': '49.224.0.0/14',
4296 'OM': '5.36.0.0/15',
4297 'PA': '186.72.0.0/15',
4298 'PE': '186.160.0.0/14',
4299 'PF': '123.50.64.0/18',
4300 'PG': '124.240.192.0/19',
4301 'PH': '49.144.0.0/13',
4302 'PK': '39.32.0.0/11',
4303 'PL': '83.0.0.0/11',
4304 'PM': '70.36.0.0/20',
4305 'PR': '66.50.0.0/16',
4306 'PS': '188.161.0.0/16',
4307 'PT': '85.240.0.0/13',
4308 'PW': '202.124.224.0/20',
4309 'PY': '181.120.0.0/14',
4310 'QA': '37.210.0.0/15',
4311 'RE': '102.35.0.0/16',
4312 'RO': '79.112.0.0/13',
4313 'RS': '93.86.0.0/15',
4314 'RU': '5.136.0.0/13',
4315 'RW': '41.186.0.0/16',
4316 'SA': '188.48.0.0/13',
4317 'SB': '202.1.160.0/19',
4318 'SC': '154.192.0.0/11',
4319 'SD': '102.120.0.0/13',
4320 'SE': '78.64.0.0/12',
4321 'SG': '8.128.0.0/10',
4322 'SI': '188.196.0.0/14',
4323 'SK': '78.98.0.0/15',
4324 'SL': '102.143.0.0/17',
4325 'SM': '89.186.32.0/19',
4326 'SN': '41.82.0.0/15',
4327 'SO': '154.115.192.0/18',
4328 'SR': '186.179.128.0/17',
4329 'SS': '105.235.208.0/21',
4330 'ST': '197.159.160.0/19',
4331 'SV': '168.243.0.0/16',
4332 'SX': '190.102.0.0/20',
4333 'SY': '5.0.0.0/16',
4334 'SZ': '41.84.224.0/19',
4335 'TC': '65.255.48.0/20',
4336 'TD': '154.68.128.0/19',
4337 'TG': '196.168.0.0/14',
4338 'TH': '171.96.0.0/13',
4339 'TJ': '85.9.128.0/18',
4340 'TK': '27.96.24.0/21',
4341 'TL': '180.189.160.0/20',
4342 'TM': '95.85.96.0/19',
4343 'TN': '197.0.0.0/11',
4344 'TO': '175.176.144.0/21',
4345 'TR': '78.160.0.0/11',
4346 'TT': '186.44.0.0/15',
4347 'TV': '202.2.96.0/19',
4348 'TW': '120.96.0.0/11',
4349 'TZ': '156.156.0.0/14',
4350 'UA': '37.52.0.0/14',
4351 'UG': '102.80.0.0/13',
4352 'US': '6.0.0.0/8',
4353 'UY': '167.56.0.0/13',
4354 'UZ': '84.54.64.0/18',
4355 'VA': '212.77.0.0/19',
4356 'VC': '207.191.240.0/21',
4357 'VE': '186.88.0.0/13',
4358 'VG': '66.81.192.0/20',
4359 'VI': '146.226.0.0/16',
4360 'VN': '14.160.0.0/11',
4361 'VU': '202.80.32.0/20',
4362 'WF': '117.20.32.0/21',
4363 'WS': '202.4.32.0/19',
4364 'YE': '134.35.0.0/16',
4365 'YT': '41.242.116.0/22',
4366 'ZA': '41.0.0.0/11',
4367 'ZM': '102.144.0.0/13',
4368 'ZW': '102.177.192.0/18',
4369 }
4370
4371 @classmethod
4372 def random_ipv4(cls, code_or_block):
4373 if len(code_or_block) == 2:
4374 block = cls._country_ip_map.get(code_or_block.upper())
4375 if not block:
4376 return None
4377 else:
4378 block = code_or_block
4379 addr, preflen = block.split('/')
4380 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4381 addr_max = addr_min | (0xffffffff >> int(preflen))
4382 return compat_str(socket.inet_ntoa(
4383 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4384
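# Usage sketch: pass either a two-letter country code or a CIDR block.
# The result is random, so these are only illustrative:
#   GeoUtils.random_ipv4('DE')          # some address within 53.0.0.0/8
#   GeoUtils.random_ipv4('10.0.0.0/8')  # some address within 10.0.0.0/8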
4385
4386 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4387 def __init__(self, proxies=None):
4388 # Set default handlers
4389 for type in ('http', 'https'):
4390 setattr(self, '%s_open' % type,
4391 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4392 meth(r, proxy, type))
4393 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4394
4395 def proxy_open(self, req, proxy, type):
4396 req_proxy = req.headers.get('Ytdl-request-proxy')
4397 if req_proxy is not None:
4398 proxy = req_proxy
4399 del req.headers['Ytdl-request-proxy']
4400
4401 if proxy == '__noproxy__':
4402 return None # No Proxy
4403 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4404 req.add_header('Ytdl-socks-proxy', proxy)
4405 # yt-dlp's http/https handlers wrap the socket with socks themselves
4406 return None
4407 return compat_urllib_request.ProxyHandler.proxy_open(
4408 self, req, proxy, type)
4409
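# Usage sketch: install the handler globally, then override the proxy for a
# single request via the Ytdl-request-proxy header ('__noproxy__' disables
# proxying for that request):
#   opener = compat_urllib_request.build_opener(
#       PerRequestProxyHandler({'http': 'http://127.0.0.1:3128'}))
#   req = compat_urllib_request.Request('http://example.com')
#   req.add_header('Ytdl-request-proxy', '__noproxy__')
#   opener.open(req)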
4410
4411 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4412 # released into Public Domain
4413 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4414
4415 def long_to_bytes(n, blocksize=0):
4416 """long_to_bytes(n:long, blocksize:int) : string
4417 Convert a long integer to a byte string.
4418
4419 If optional blocksize is given and greater than zero, pad the front of the
4420 byte string with binary zeros so that the length is a multiple of
4421 blocksize.
4422 """
4423 # after much testing, this algorithm was deemed to be the fastest
4424 s = b''
4425 n = int(n)
4426 while n > 0:
4427 s = compat_struct_pack('>I', n & 0xffffffff) + s
4428 n = n >> 32
4429 # strip off leading zeros
4430 for i in range(len(s)):
4431 if s[i] != b'\000'[0]:
4432 break
4433 else:
4434 # only happens when n == 0
4435 s = b'\000'
4436 i = 0
4437 s = s[i:]
4438 # add back some pad bytes. this could be done more efficiently w.r.t. the
4439 # de-padding being done above, but sigh...
4440 if blocksize > 0 and len(s) % blocksize:
4441 s = (blocksize - len(s) % blocksize) * b'\000' + s
4442 return s
4443
4444
4445 def bytes_to_long(s):
4446 """bytes_to_long(string) : long
4447 Convert a byte string to a long integer.
4448
4449 This is (essentially) the inverse of long_to_bytes().
4450 """
4451 acc = 0
4452 length = len(s)
4453 if length % 4:
4454 extra = (4 - length % 4)
4455 s = b'\000' * extra + s
4456 length = length + extra
4457 for i in range(0, length, 4):
4458 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4459 return acc
4460
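# Usage sketch (the two functions are inverses of each other):
#   >>> long_to_bytes(65537, 4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537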
4461
4462 def ohdave_rsa_encrypt(data, exponent, modulus):
4463 '''
4464 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4465
4466 Input:
4467 data: data to encrypt, bytes-like object
4468 exponent, modulus: parameter e and N of RSA algorithm, both integer
4469 Output: hex string of encrypted data
4470
4471 Limitation: supports one block encryption only
4472 '''
4473
4474 payload = int(binascii.hexlify(data[::-1]), 16)
4475 encrypted = pow(payload, exponent, modulus)
4476 return '%x' % encrypted
4477
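# Usage sketch with toy (insecure) parameters e=7, N=77:
#   >>> ohdave_rsa_encrypt(b'\x02', 7, 77)  # pow(2, 7, 77) == 51
#   '33'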
4478
4479 def pkcs1pad(data, length):
4480 """
4481 Padding input data with PKCS#1 scheme
4482
4483 @param {int[]} data input data
4484 @param {int} length target length
4485 @returns {int[]} padded data
4486 """
4487 if len(data) > length - 11:
4488 raise ValueError('Input data too long for PKCS#1 padding')
4489
4490 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 requires nonzero padding bytes
4491 return [0, 2] + pseudo_random + [0] + data
4492
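# Usage sketch (the random filler varies; the frame around it does not):
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> padded[:2], padded[-4:], len(padded)
#   ([0, 2], [0, 1, 2, 3], 16)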
4493
4494 def encode_base_n(num, n, table=None):
4495 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4496 if not table:
4497 table = FULL_TABLE[:n]
4498
4499 if n > len(table):
4500 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4501
4502 if num == 0:
4503 return table[0]
4504
4505 ret = ''
4506 while num:
4507 ret = table[num % n] + ret
4508 num = num // n
4509 return ret
4510
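# Usage sketch:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(62, 62)
#   '10'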
4511
4512 def decode_packed_codes(code):
4513 mobj = re.search(PACKED_CODES_RE, code)
4514 obfuscated_code, base, count, symbols = mobj.groups()
4515 base = int(base)
4516 count = int(count)
4517 symbols = symbols.split('|')
4518 symbol_table = {}
4519
4520 while count:
4521 count -= 1
4522 base_n_count = encode_base_n(count, base)
4523 symbol_table[base_n_count] = symbols[count] or base_n_count
4524
4525 return re.sub(
4526 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4527 obfuscated_code)
4528
4529
4530 def caesar(s, alphabet, shift):
4531 if shift == 0:
4532 return s
4533 l = len(alphabet)
4534 return ''.join(
4535 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4536 for c in s)
4537
4538
4539 def rot47(s):
4540 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4541
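# Usage sketch:
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'bcd'
#   >>> rot47('ytdlp')
#   'JE5=A'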
4542
4543 def parse_m3u8_attributes(attrib):
4544 info = {}
4545 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4546 if val.startswith('"'):
4547 val = val[1:-1]
4548 info[key] = val
4549 return info
4550
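# Usage sketch (attribute list as found on an EXT-X-STREAM-INF line):
#   >>> parse_m3u8_attributes('BANDWIDTH=630000,CODECS="mp4a.40.2,avc1.4D401E"')
#   {'BANDWIDTH': '630000', 'CODECS': 'mp4a.40.2,avc1.4D401E'}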
4551
4552 def urshift(val, n):
4553 return val >> n if val >= 0 else (val + 0x100000000) >> n
4554
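# Usage sketch: unsigned 32-bit right shift, like JavaScript's `>>>` operator:
#   >>> urshift(-1, 1)
#   2147483647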
4555
4556 # Based on png2str() written by @gdkchan and improved by @yokrysty
4557 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4558 def decode_png(png_data):
4559 # Reference: https://www.w3.org/TR/PNG/
4560 header = png_data[8:]
4561
4562 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4563 raise IOError('Not a valid PNG file.')
4564
4565 int_map = {1: '>B', 2: '>H', 4: '>I'}
4566 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4567
4568 chunks = []
4569
4570 while header:
4571 length = unpack_integer(header[:4])
4572 header = header[4:]
4573
4574 chunk_type = header[:4]
4575 header = header[4:]
4576
4577 chunk_data = header[:length]
4578 header = header[length:]
4579
4580 header = header[4:] # Skip CRC
4581
4582 chunks.append({
4583 'type': chunk_type,
4584 'length': length,
4585 'data': chunk_data
4586 })
4587
4588 ihdr = chunks[0]['data']
4589
4590 width = unpack_integer(ihdr[:4])
4591 height = unpack_integer(ihdr[4:8])
4592
4593 idat = b''
4594
4595 for chunk in chunks:
4596 if chunk['type'] == b'IDAT':
4597 idat += chunk['data']
4598
4599 if not idat:
4600 raise IOError('Unable to read PNG data.')
4601
4602 decompressed_data = bytearray(zlib.decompress(idat))
4603
4604 stride = width * 3
4605 pixels = []
4606
4607 def _get_pixel(idx):
4608 x = idx % stride
4609 y = idx // stride
4610 return pixels[y][x]
4611
4612 for y in range(height):
4613 basePos = y * (1 + stride)
4614 filter_type = decompressed_data[basePos]
4615
4616 current_row = []
4617
4618 pixels.append(current_row)
4619
4620 for x in range(stride):
4621 color = decompressed_data[1 + basePos + x]
4622 basex = y * stride + x
4623 left = 0
4624 up = 0
4625
4626 if x > 2:
4627 left = _get_pixel(basex - 3)
4628 if y > 0:
4629 up = _get_pixel(basex - stride)
4630
4631 if filter_type == 1: # Sub
4632 color = (color + left) & 0xff
4633 elif filter_type == 2: # Up
4634 color = (color + up) & 0xff
4635 elif filter_type == 3: # Average
4636 color = (color + ((left + up) >> 1)) & 0xff
4637 elif filter_type == 4: # Paeth
4638 a = left
4639 b = up
4640 c = 0
4641
4642 if x > 2 and y > 0:
4643 c = _get_pixel(basex - stride - 3)
4644
4645 p = a + b - c
4646
4647 pa = abs(p - a)
4648 pb = abs(p - b)
4649 pc = abs(p - c)
4650
4651 if pa <= pb and pa <= pc:
4652 color = (color + a) & 0xff
4653 elif pb <= pc:
4654 color = (color + b) & 0xff
4655 else:
4656 color = (color + c) & 0xff
4657
4658 current_row.append(color)
4659
4660 return width, height, pixels
4661
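# Usage sketch:
#   with open('thumbnail.png', 'rb') as f:
#       width, height, pixels = decode_png(f.read())
#   # pixels[y] is a flat row of RGB byte values: pixels[y][3 * x] is the red
#   # component of the pixel at (x, y)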
4662
4663 def write_xattr(path, key, value):
4664 # This mess below finds the best xattr tool for the job
4665 try:
4666 # try the pyxattr module...
4667 import xattr
4668
4669 if hasattr(xattr, 'set'): # pyxattr
4670 # Unicode arguments are not supported in python-pyxattr until
4671 # version 0.5.0
4672 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4673 pyxattr_required_version = '0.5.0'
4674 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4675 # TODO: fallback to CLI tools
4676 raise XAttrUnavailableError(
4677 'python-pyxattr is detected but is too old. '
4678 'yt-dlp requires %s or above while your version is %s. '
4679 'Falling back to other xattr implementations' % (
4680 pyxattr_required_version, xattr.__version__))
4681
4682 setxattr = xattr.set
4683 else: # xattr
4684 setxattr = xattr.setxattr
4685
4686 try:
4687 setxattr(path, key, value)
4688 except EnvironmentError as e:
4689 raise XAttrMetadataError(e.errno, e.strerror)
4690
4691 except ImportError:
4692 if compat_os_name == 'nt':
4693 # Write xattrs to NTFS Alternate Data Streams:
4694 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4695 assert ':' not in key
4696 assert os.path.exists(path)
4697
4698 ads_fn = path + ':' + key
4699 try:
4700 with open(ads_fn, 'wb') as f:
4701 f.write(value)
4702 except EnvironmentError as e:
4703 raise XAttrMetadataError(e.errno, e.strerror)
4704 else:
4705 user_has_setfattr = check_executable('setfattr', ['--version'])
4706 user_has_xattr = check_executable('xattr', ['-h'])
4707
4708 if user_has_setfattr or user_has_xattr:
4709
4710 value = value.decode('utf-8')
4711 if user_has_setfattr:
4712 executable = 'setfattr'
4713 opts = ['-n', key, '-v', value]
4714 elif user_has_xattr:
4715 executable = 'xattr'
4716 opts = ['-w', key, value]
4717
4718 cmd = ([encodeFilename(executable, True)]
4719 + [encodeArgument(o) for o in opts]
4720 + [encodeFilename(path, True)])
4721
4722 try:
4723 p = Popen(
4724 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4725 except EnvironmentError as e:
4726 raise XAttrMetadataError(e.errno, e.strerror)
4727 stdout, stderr = p.communicate_or_kill()
4728 stderr = stderr.decode('utf-8', 'replace')
4729 if p.returncode != 0:
4730 raise XAttrMetadataError(p.returncode, stderr)
4731
4732 else:
4733 # On Unix, but we can't find pyxattr, setfattr, or xattr.
4734 if sys.platform.startswith('linux'):
4735 raise XAttrUnavailableError(
4736 "Couldn't find a tool to set the xattrs. "
4737 "Install either the python 'pyxattr' or 'xattr' "
4738 "modules, or the GNU 'attr' package "
4739 "(which contains the 'setfattr' tool).")
4740 else:
4741 raise XAttrUnavailableError(
4742 "Couldn't find a tool to set the xattrs. "
4743 "Install either the python 'xattr' module, "
4744 "or the 'xattr' binary.")
4745
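# Usage sketch (the attribute namespace is platform-dependent; 'user.' on Linux):
#   write_xattr('video.mp4', 'user.ytdl.filesize', b'12345')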
4746
4747 def random_birthday(year_field, month_field, day_field):
4748 start_date = datetime.date(1950, 1, 1)
4749 end_date = datetime.date(1995, 12, 31)
4750 offset = random.randint(0, (end_date - start_date).days)
4751 random_date = start_date + datetime.timedelta(offset)
4752 return {
4753 year_field: str(random_date.year),
4754 month_field: str(random_date.month),
4755 day_field: str(random_date.day),
4756 }
4757
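# Usage sketch (field names are caller-chosen; values are random within 1950-1995):
#   >>> random_birthday('birth_year', 'birth_month', 'birth_day')
#   {'birth_year': '1987', 'birth_month': '6', 'birth_day': '21'}  # for instance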
4758
4759 # Templates for internet shortcut files, which are plain text files.
4760 DOT_URL_LINK_TEMPLATE = '''
4761 [InternetShortcut]
4762 URL=%(url)s
4763 '''.lstrip()
4764
4765 DOT_WEBLOC_LINK_TEMPLATE = '''
4766 <?xml version="1.0" encoding="UTF-8"?>
4767 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4768 <plist version="1.0">
4769 <dict>
4770 \t<key>URL</key>
4771 \t<string>%(url)s</string>
4772 </dict>
4773 </plist>
4774 '''.lstrip()
4775
4776 DOT_DESKTOP_LINK_TEMPLATE = '''
4777 [Desktop Entry]
4778 Encoding=UTF-8
4779 Name=%(filename)s
4780 Type=Link
4781 URL=%(url)s
4782 Icon=text-html
4783 '''.lstrip()
4784
4785 LINK_TEMPLATES = {
4786 'url': DOT_URL_LINK_TEMPLATE,
4787 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4788 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4789 }
4790
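# Usage sketch: render a template with % formatting, e.g. for a .url file:
#   >>> LINK_TEMPLATES['url'] % {'url': 'https://example.com/'}
#   '[InternetShortcut]\nURL=https://example.com/\n'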
4791
4792 def iri_to_uri(iri):
4793 """
4794 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4795
4796 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4797 """
4798
4799 iri_parts = compat_urllib_parse_urlparse(iri)
4800
4801 if '[' in iri_parts.netloc:
4802 raise ValueError('IPv6 URIs are not yet supported.')
4803 # Querying `.netloc` also raises a ValueError when there's only one bracket.
4804
4805 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4806
4807 net_location = ''
4808 if iri_parts.username:
4809 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4810 if iri_parts.password is not None:
4811 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4812 net_location += '@'
4813
4814 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4815 # The 'idna' encoding produces ASCII text.
4816 if iri_parts.port is not None and iri_parts.port != 80:
4817 net_location += ':' + str(iri_parts.port)
4818
4819 return compat_urllib_parse_urlunparse(
4820 (iri_parts.scheme,
4821 net_location,
4822
4823 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4824
4825 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4826 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4827
4828 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4829 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4830
4831 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4832
4833 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4834
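# Usage sketch (ASCII host, non-ASCII path/query; note that quote_plus turns
# spaces into '+'):
#   >>> iri_to_uri('https://example.com/héllo wörld?q=héllo')
#   'https://example.com/h%C3%A9llo+w%C3%B6rld?q=h%C3%A9llo'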
4835
4836 def to_high_limit_path(path):
4837 if sys.platform in ['win32', 'cygwin']:
4838 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4839 return r'\\?\ '.rstrip() + os.path.abspath(path)
4840
4841 return path
4842
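# Usage sketch (Windows only; other platforms get the path back unchanged):
#   to_high_limit_path(r'C:\very\long\path\video.mp4')
#   # -> r'\\?\C:\very\long\path\video.mp4'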
4843
4844 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4845 if field is None:
4846 val = obj if obj is not None else default
4847 else:
4848 val = obj.get(field, default)
4849 if func and val not in ignore:
4850 val = func(val)
4851 return template % val if val not in ignore else default
4852
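# Usage sketch:
#   >>> format_field({'width': 1920}, 'width', '%dpx')
#   '1920px'
#   >>> format_field({'width': None}, 'width', '%dpx', default='unknown')
#   'unknown'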
4853
4854 def clean_podcast_url(url):
4855 return re.sub(r'''(?x)
4856 (?:
4857 (?:
4858 chtbl\.com/track|
4859 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4860 play\.podtrac\.com
4861 )/[^/]+|
4862 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4863 flex\.acast\.com|
4864 pd(?:
4865 cn\.co| # https://podcorn.com/analytics-prefix/
4866 st\.fm # https://podsights.com/docs/
4867 )/e
4868 )/''', '', url)
4869
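# Usage sketch (strips known tracking/measurement redirect prefixes):
#   >>> clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3')
#   'https://traffic.megaphone.fm/HSW7835899191.mp3'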
4870
4871 _HEX_TABLE = '0123456789abcdef'
4872
4873
4874 def random_uuidv4():
4875 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4876
4877
4878 def make_dir(path, to_screen=None):
4879 try:
4880 dn = os.path.dirname(path)
4881 if dn and not os.path.exists(dn):
4882 os.makedirs(dn)
4883 return True
4884 except (OSError, IOError) as err:
4885 if callable(to_screen):  # callable() returns a bool; comparing it to None was always true
4886 to_screen('unable to create directory ' + error_to_compat_str(err))
4887 return False
4888
4889
4890 def get_executable_path():
4891 from zipimport import zipimporter
4892 if hasattr(sys, 'frozen'): # Running from PyInstaller
4893 path = os.path.dirname(sys.executable)
4894 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
4895 path = os.path.join(os.path.dirname(__file__), '../..')
4896 else:
4897 path = os.path.join(os.path.dirname(__file__), '..')
4898 return os.path.abspath(path)
4899
4900
4901 def load_plugins(name, suffix, namespace):
4902 classes = {}
4903 try:
4904 plugins_spec = importlib.util.spec_from_file_location(
4905 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4906 plugins = importlib.util.module_from_spec(plugins_spec)
4907 sys.modules[plugins_spec.name] = plugins
4908 plugins_spec.loader.exec_module(plugins)
4909 for name in dir(plugins):
4910 if name in namespace:
4911 continue
4912 if not name.endswith(suffix):
4913 continue
4914 klass = getattr(plugins, name)
4915 classes[name] = namespace[name] = klass
4916 except FileNotFoundError:
4917 pass
4918 return classes
4919
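# Usage sketch (as used for extractor plugins: loads classes whose names end
# in 'IE' from ytdlp_plugins/extractor/ into the given namespace):
#   classes = load_plugins('extractor', 'IE', globals())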
4920
4921 def traverse_obj(
4922 obj, *path_list, default=None, expected_type=None, get_all=True,
4923 casesense=True, is_user_input=False, traverse_string=False):
4924 ''' Traverse nested list/dict/tuple
4925 @param path_list A list of paths which are checked one by one.
4926 Each path is a list of keys where each key is a string,
4927 a function, a tuple of strings or "...".
4928 When a function is given, it takes the key as argument and
4929 returns whether the key matches or not. When a tuple is given,
4930 all the keys given in the tuple are traversed, and
4931 "..." traverses all the keys in the object
4932 @param default Default value to return
4933 @param expected_type Only accept final value of this type (Can also be any callable)
4934 @param get_all Return all the values obtained from a path or only the first one
4935 @param casesense Whether to consider dictionary keys as case sensitive
4936 @param is_user_input Whether the keys are generated from user input. If True,
4937 strings are converted to int/slice if necessary
4938 @param traverse_string Whether to traverse inside strings. If True, any
4939 non-compatible object will also be converted into a string
4940 # TODO: Write tests
4941 '''
4942 if not casesense:
4943 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4944 path_list = (map(_lower, variadic(path)) for path in path_list)
4945
4946 def _traverse_obj(obj, path, _current_depth=0):
4947 nonlocal depth
4948 path = tuple(variadic(path))
4949 for i, key in enumerate(path):
4950 if obj is None:
4951 return None
4952 if isinstance(key, (list, tuple)):
4953 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4954 key = ...
4955 if key is ...:
4956 obj = (obj.values() if isinstance(obj, dict)
4957 else obj if isinstance(obj, (list, tuple, LazyList))
4958 else str(obj) if traverse_string else [])
4959 _current_depth += 1
4960 depth = max(depth, _current_depth)
4961 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4962 elif callable(key):
4963 if isinstance(obj, (list, tuple, LazyList)):
4964 obj = enumerate(obj)
4965 elif isinstance(obj, dict):
4966 obj = obj.items()
4967 else:
4968 if not traverse_string:
4969 return None
4970 obj = str(obj)
4971 _current_depth += 1
4972 depth = max(depth, _current_depth)
4973 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
4974 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4975 obj = (obj.get(key) if casesense or (key in obj)
4976 else next((v for k, v in obj.items() if _lower(k) == key), None))
4977 else:
4978 if is_user_input:
4979 key = (int_or_none(key) if ':' not in key
4980 else slice(*map(int_or_none, key.split(':'))))
4981 if key == slice(None):
4982 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
4983 if not isinstance(key, (int, slice)):
4984 return None
4985 if not isinstance(obj, (list, tuple, LazyList)):
4986 if not traverse_string:
4987 return None
4988 obj = str(obj)
4989 try:
4990 obj = obj[key]
4991 except IndexError:
4992 return None
4993 return obj
4994
4995 if isinstance(expected_type, type):
4996 type_test = lambda val: val if isinstance(val, expected_type) else None
4997 elif expected_type is not None:
4998 type_test = expected_type
4999 else:
5000 type_test = lambda val: val
5001
5002 for path in path_list:
5003 depth = 0
5004 val = _traverse_obj(obj, path)
5005 if val is not None:
5006 if depth:
5007 for _ in range(depth - 1):
5008 val = itertools.chain.from_iterable(v for v in val if v is not None)
5009 val = [v for v in map(type_test, val) if v is not None]
5010 if val:
5011 return val if get_all else val[0]
5012 else:
5013 val = type_test(val)
5014 if val is not None:
5015 return val
5016 return default
5017
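# Usage sketches:
#   >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))
#   [1, 2]
#   >>> traverse_obj({'a': {'b': 'x'}}, ('a', 'missing'), ('a', 'b'))  # falls through to the second path
#   'x'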
5018
5019 # Deprecated
5020 def traverse_dict(dictn, keys, casesense=True):
5021 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5022 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5023 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5024
5025
5026 def variadic(x, allowed_types=(str, bytes)):
5027 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5028
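# Usage sketch: wraps scalars (including str/bytes) in a tuple, passes other
# iterables through unchanged:
#   >>> variadic('spam')
#   ('spam',)
#   >>> variadic(['spam', 'eggs'])
#   ['spam', 'eggs']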
5029
5030 # create a JSON Web Signature (JWS) with the HS256 algorithm
5031 # the result is in JWS Compact Serialization format
5032 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5033 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5034 def jwt_encode_hs256(payload_data, key, headers={}):
5035 header_data = {
5036 'alg': 'HS256',
5037 'typ': 'JWT',
5038 }
5039 if headers:
5040 header_data.update(headers)
5041 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5042 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5043 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5044 signature_b64 = base64.b64encode(h.digest())
5045 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5046 return token
5047
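# Usage sketch (note: this helper uses standard Base64 with padding, whereas
# RFC 7515 specifies unpadded base64url; jwt_decode_hs256 below decodes the
# payload without verifying the signature):
#   token = jwt_encode_hs256({'user': 'test'}, 'secret')
#   assert jwt_decode_hs256(token.decode('utf-8')) == {'user': 'test'}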
5048
5049 # NB: this does not verify the signature; it can be extended in the future to verify it and to parse the header, returning the algorithm used if it's not HS256
5050 def jwt_decode_hs256(jwt):
5051 header_b64, payload_b64, signature_b64 = jwt.split('.')
5052 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5053 return payload_data
5054
5055
5056 def supports_terminal_sequences(stream):
5057 if compat_os_name == 'nt':
5058 from .compat import WINDOWS_VT_MODE # Must be imported locally
5059 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5060 return False
5061 elif not os.getenv('TERM'):
5062 return False
5063 try:
5064 return stream.isatty()
5065 except BaseException:
5066 return False
5067
5068
5069 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5070
5071
5072 def remove_terminal_sequences(string):
5073 return _terminal_sequences_re.sub('', string)
5074
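# Usage sketch: strips ANSI SGR color/style sequences from a string:
#   >>> remove_terminal_sequences('\033[0;31mERROR:\033[0m oops')
#   'ERROR: oops'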
5075
5076 def number_of_digits(number):
5077 return len('%d' % number)
5078
5079
5080 def join_nonempty(*values, delim='-', from_dict=None):
5081 if from_dict is not None:
5082 values = map(from_dict.get, values)
5083 return delim.join(map(str, filter(None, values)))
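# Usage sketch: falsy values are dropped before joining:
#   >>> join_nonempty('a', None, '', 'b')
#   'a-b'
#   >>> join_nonempty('en', 'US', delim='_')
#   'en_US'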