#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_brotli,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_websockets,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)

try:
    import certifi
    has_certifi = True
except ImportError:
    has_certifi = False


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if compat_brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

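# Editor's illustration for xpath_with_ns() above (not part of the original module):
#   xpath_with_ns('media:song/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#   == '{http://search.yahoo.com/mrss/}song/{http://search.yahoo.com/mrss/}title'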

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None

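# Editor's illustration for get_element_by_class() above, mirroring yt-dlp's test
# suite (not part of the original module):
#   get_element_by_class('foo', '<span class="foo bar">nice</span>') == 'nice'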

def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs

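# Editor's illustration for extract_attributes() above, mirroring yt-dlp's test
# suite (not part of the original module):
#   extract_attributes('<e x="y">') == {'x': 'y'}
#   extract_attributes('<e x="&quot;">') == {'x': '"'}
#   extract_attributes('<e x>') == {'x': None}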

def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

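# Editor's illustration for clean_html() above, mirroring yt-dlp's test suite
# (not part of the original module):
#   clean_html('a:\n\n"b"') == 'a: "b"'
#   clean_html('a<br/>b') == 'a\nb'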

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

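# Editor's illustration for timeconvert() above (not part of the original module):
#   timeconvert('Wed, 14 Feb 2018 00:00:00 +0000') == 1518566400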

def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

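# Editor's illustration for sanitize_filename() above, mirroring yt-dlp's test
# suite (not part of the original module):
#   sanitize_filename('this: that') == 'this - that'
#   sanitize_filename('New World record at 0:12:34') == 'New World record at 0_12_34'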

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

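# Editor's illustration for sanitize_url() above (not part of the original module):
#   sanitize_url('//foo.bar') == 'http://foo.bar'
#   sanitize_url('httpss://foo.bar') == 'https://foo.bar'
#   sanitize_url('rmtps://foo.bar') == 'rtmps://foo.bar'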

def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')

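# Editor's illustration for extract_basic_auth() above (not part of the original
# module); 'dXNlcjpwYXNz' is base64('user:pass'):
#   extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz')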

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

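# Editor's illustration for orderedSet() above (not part of the original module):
#   orderedSet([1, 2, 1, 3, 2]) == [1, 2, 3]   # first occurrence wins, order kept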

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

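# Editor's illustration for unescapeHTML() above, mirroring yt-dlp's test suite
# (not part of the original module):
#   unescapeHTML('&#x2F;') == '/'
#   unescapeHTML('&eacute;') == 'é'
#   unescapeHTML('&a&quot;') == '&a"'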

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )

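# Editor's illustration for escapeHTML() above (not part of the original module):
#   escapeHTML('<a href="x">') == '&lt;a href=&quot;x&quot;&gt;'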

def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)

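# Editor's illustration for timetuple_from_msec() above (not part of the original module):
#   timetuple_from_msec(1234567) == Time(hours=0, minutes=20, seconds=34, milliseconds=567)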

def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

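# Editor's illustration for formatSeconds() above (not part of the original module):
#   formatSeconds(3661) == '1:01:01'
#   formatSeconds(61.5, msec=True) == '1:01.500'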

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    # Create a new context to discard any certificates that were already loaded
                    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                    context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

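# Editor's illustration for handle_youtubedl_headers() above (not part of the
# original module); the marker header and Accept-Encoding are both dropped:
#   handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'}) == {}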

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return compat_brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as a byte string instead of unicode. This workaround forces
        # it to always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str

1764
1765 def parse_iso8601(date_str, delimiter='T', timezone=None):
1766 """ Return a UNIX timestamp from the given date """
1767
1768 if date_str is None:
1769 return None
1770
1771 date_str = re.sub(r'\.[0-9]+', '', date_str)  # strip fractional seconds
1772
1773 if timezone is None:
1774 timezone, date_str = extract_timezone(date_str)
1775
1776 try:
1777 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1778 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1779 return calendar.timegm(dt.timetuple())
1780 except ValueError:
1781 pass
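# Illustrative usage (a sketch, not part of the module; the expected values
# below match yt-dlp's test suite):
#   parse_iso8601('2014-03-23T23:04:26+0100')  # -> 1395612266
#   parse_iso8601('2014-03-23T22:04:26Z')      # -> 1395612266
#   parse_iso8601('2014-03-23T22:04:26')       # -> 1395612266 (no TZ: treated as UTC)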
1782
1783
1784 def date_formats(day_first=True):
1785 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1786
1787
1788 def unified_strdate(date_str, day_first=True):
1789 """Return a string with the date in the format YYYYMMDD"""
1790
1791 if date_str is None:
1792 return None
1793 upload_date = None
1794 # Replace commas
1795 date_str = date_str.replace(',', ' ')
1796 # Remove AM/PM + timezone
1797 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1798 _, date_str = extract_timezone(date_str)
1799
1800 for expression in date_formats(day_first):
1801 try:
1802 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1803 except ValueError:
1804 pass
1805 if upload_date is None:
1806 timetuple = email.utils.parsedate_tz(date_str)
1807 if timetuple:
1808 try:
1809 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1810 except ValueError:
1811 pass
1812 if upload_date is not None:
1813 return compat_str(upload_date)
1814
1815
1816 def unified_timestamp(date_str, day_first=True):
1817 if date_str is None:
1818 return None
1819
1820 date_str = re.sub(r'[,|]', '', date_str)
1821
1822 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1823 timezone, date_str = extract_timezone(date_str)
1824
1825 # Remove AM/PM + timezone
1826 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1827
1828 # Remove unrecognized timezones from ISO 8601 alike timestamps
1829 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1830 if m:
1831 date_str = date_str[:-len(m.group('tz'))]
1832
1833 # Python only supports microseconds, so remove nanoseconds
1834 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1835 if m:
1836 date_str = m.group(1)
1837
1838 for expression in date_formats(day_first):
1839 try:
1840 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1841 return calendar.timegm(dt.timetuple())
1842 except ValueError:
1843 pass
1844 timetuple = email.utils.parsedate_tz(date_str)
1845 if timetuple:
1846 return calendar.timegm(timetuple) + pm_delta * 3600
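# Illustrative usage (a sketch, not part of the module):
#   unified_strdate('December 21, 2010')    # -> '20101221'
#   unified_strdate('1968-12-10')           # -> '19681210'
#   unified_timestamp('December 21, 2010')  # -> 1292889600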
1847
1848
1849 def determine_ext(url, default_ext='unknown_video'):
1850 if url is None or '.' not in url:
1851 return default_ext
1852 guess = url.partition('?')[0].rpartition('.')[2]
1853 if re.match(r'^[A-Za-z0-9]+$', guess):
1854 return guess
1855 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1856 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1857 return guess.rstrip('/')
1858 else:
1859 return default_ext
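# Illustrative usage (a sketch, not part of the module):
#   determine_ext('http://example.com/video.mp4')              # -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  # -> 'mp4'
#   determine_ext('http://example.com/page')                   # -> 'unknown_video'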
1860
1861
1862 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1863 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1864
1865
1866 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1867 """
1868 Return a datetime object from a string in the format YYYYMMDD or
1869 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1870
1871 format: string date format used to return datetime object from
1872 precision: round the time portion of a datetime object.
1873 auto|microsecond|second|minute|hour|day.
1874 auto: round to the unit provided in date_str (if applicable).
1875 """
1876 auto_precision = False
1877 if precision == 'auto':
1878 auto_precision = True
1879 precision = 'microsecond'
1880 today = datetime_round(datetime.datetime.utcnow(), precision)
1881 if date_str in ('now', 'today'):
1882 return today
1883 if date_str == 'yesterday':
1884 return today - datetime.timedelta(days=1)
1885 match = re.match(
1886 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1887 date_str)
1888 if match is not None:
1889 start_time = datetime_from_str(match.group('start'), precision, format)
1890 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1891 unit = match.group('unit')
1892 if unit == 'month' or unit == 'year':
1893 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1894 unit = 'day'
1895 else:
1896 if unit == 'week':
1897 unit = 'day'
1898 time *= 7
1899 delta = datetime.timedelta(**{unit + 's': time})
1900 new_date = start_time + delta
1901 if auto_precision:
1902 return datetime_round(new_date, unit)
1903 return new_date
1904
1905 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1906
1907
1908 def date_from_str(date_str, format='%Y%m%d', strict=False):
1909 """
1910 Return a datetime object from a string in the format YYYYMMDD or
1911 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1912
1913 If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1914
1915 format: string date format used to return datetime object from
1916 """
1917 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1918 raise ValueError(f'Invalid date format {date_str}')
1919 return datetime_from_str(date_str, precision='microsecond', format=format).date()
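# Illustrative usage (a sketch, not part of the module; the relative forms are
# evaluated against the current UTC time, so only the literal date is deterministic):
#   date_from_str('20200229')                 # -> datetime.date(2020, 2, 29)
#   date_from_str('now-1week')                # -> today's date minus 7 days
#   date_from_str('today-1day', strict=True)  # also accepted under "strict"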
1920
1921
1922 def datetime_add_months(dt, months):
1923 """Increment/Decrement a datetime object by months."""
1924 month = dt.month + months - 1
1925 year = dt.year + month // 12
1926 month = month % 12 + 1
1927 day = min(dt.day, calendar.monthrange(year, month)[1])
1928 return dt.replace(year, month, day)
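# Illustrative example (a sketch, not part of the module) - the day is clamped
# to the last day of the target month:
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   # -> datetime.datetime(2020, 2, 29, 0, 0)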
1929
1930
1931 def datetime_round(dt, precision='day'):
1932 """
1933 Round a datetime object's time to a specific precision
1934 """
1935 if precision == 'microsecond':
1936 return dt
1937
1938 unit_seconds = {
1939 'day': 86400,
1940 'hour': 3600,
1941 'minute': 60,
1942 'second': 1,
1943 }
1944 roundto = lambda x, n: ((x + n / 2) // n) * n
1945 timestamp = calendar.timegm(dt.timetuple())
1946 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1947
1948
1949 def hyphenate_date(date_str):
1950 """
1951 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1952 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1953 if match is not None:
1954 return '-'.join(match.groups())
1955 else:
1956 return date_str
1957
1958
1959 class DateRange(object):
1960 """Represents a time interval between two dates"""
1961
1962 def __init__(self, start=None, end=None):
1963 """start and end must be strings in the format accepted by date"""
1964 if start is not None:
1965 self.start = date_from_str(start, strict=True)
1966 else:
1967 self.start = datetime.datetime.min.date()
1968 if end is not None:
1969 self.end = date_from_str(end, strict=True)
1970 else:
1971 self.end = datetime.datetime.max.date()
1972 if self.start > self.end:
1973 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1974
1975 @classmethod
1976 def day(cls, day):
1977 """Returns a range that only contains the given day"""
1978 return cls(day, day)
1979
1980 def __contains__(self, date):
1981 """Check if the date is in the range"""
1982 if not isinstance(date, datetime.date):
1983 date = date_from_str(date)
1984 return self.start <= date <= self.end
1985
1986 def __str__(self):
1987 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
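# Illustrative usage (a sketch, not part of the module):
#   '20200115' in DateRange('20200101', '20200131')  # -> True
#   '20200201' in DateRange.day('20200131')          # -> False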
1988
1989
1990 def platform_name():
1991 """ Returns the platform name as a compat_str """
1992 res = platform.platform()
1993 if isinstance(res, bytes):
1994 res = res.decode(preferredencoding())
1995
1996 assert isinstance(res, compat_str)
1997 return res
1998
1999
2000 def get_windows_version():
2001 ''' Get Windows version. None if it's not running on Windows '''
2002 if compat_os_name == 'nt':
2003 return version_tuple(platform.win32_ver()[1])
2004 else:
2005 return None
2006
2007
2008 def _windows_write_string(s, out):
2009 """ Returns True if the string was written using special methods,
2010 False if it has yet to be written out."""
2011 # Adapted from http://stackoverflow.com/a/3259271/35070
2012
2013 import ctypes.wintypes
2014
2015 WIN_OUTPUT_IDS = {
2016 1: -11,
2017 2: -12,
2018 }
2019
2020 try:
2021 fileno = out.fileno()
2022 except AttributeError:
2023 # If the output stream doesn't have a fileno, it's virtual
2024 return False
2025 except io.UnsupportedOperation:
2026 # Some strange Windows pseudo files?
2027 return False
2028 if fileno not in WIN_OUTPUT_IDS:
2029 return False
2030
2031 GetStdHandle = compat_ctypes_WINFUNCTYPE(
2032 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2033 ('GetStdHandle', ctypes.windll.kernel32))
2034 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2035
2036 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2037 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2038 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2039 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2040 written = ctypes.wintypes.DWORD(0)
2041
2042 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2043 FILE_TYPE_CHAR = 0x0002
2044 FILE_TYPE_REMOTE = 0x8000
2045 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2046 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2047 ctypes.POINTER(ctypes.wintypes.DWORD))(
2048 ('GetConsoleMode', ctypes.windll.kernel32))
2049 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2050
2051 def not_a_console(handle):
2052 if handle == INVALID_HANDLE_VALUE or handle is None:
2053 return True
2054 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2055 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2056
2057 if not_a_console(h):
2058 return False
2059
2060 def next_nonbmp_pos(s):
2061 try:
2062 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2063 except StopIteration:
2064 return len(s)
2065
2066 while s:
2067 count = min(next_nonbmp_pos(s), 1024)
2068
2069 ret = WriteConsoleW(
2070 h, s, count if count else 2, ctypes.byref(written), None)
2071 if ret == 0:
2072 raise OSError('Failed to write string')
2073 if not count: # We just wrote a non-BMP character
2074 assert written.value == 2
2075 s = s[1:]
2076 else:
2077 assert written.value > 0
2078 s = s[written.value:]
2079 return True
2080
2081
2082 def write_string(s, out=None, encoding=None):
2083 if out is None:
2084 out = sys.stderr
2085 assert type(s) == compat_str
2086
2087 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2088 if _windows_write_string(s, out):
2089 return
2090
2091 if ('b' in getattr(out, 'mode', '')
2092 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2093 byt = s.encode(encoding or preferredencoding(), 'ignore')
2094 out.write(byt)
2095 elif hasattr(out, 'buffer'):
2096 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2097 byt = s.encode(enc, 'ignore')
2098 out.buffer.write(byt)
2099 else:
2100 out.write(s)
2101 out.flush()
2102
2103
2104 def bytes_to_intlist(bs):
2105 if not bs:
2106 return []
2107 if isinstance(bs[0], int): # Python 3
2108 return list(bs)
2109 else:
2110 return [ord(c) for c in bs]
2111
2112
2113 def intlist_to_bytes(xs):
2114 if not xs:
2115 return b''
2116 return compat_struct_pack('%dB' % len(xs), *xs)
2117
2118
2119 # Cross-platform file locking
2120 if sys.platform == 'win32':
2121 import ctypes.wintypes
2122 import msvcrt
2123
2124 class OVERLAPPED(ctypes.Structure):
2125 _fields_ = [
2126 ('Internal', ctypes.wintypes.LPVOID),
2127 ('InternalHigh', ctypes.wintypes.LPVOID),
2128 ('Offset', ctypes.wintypes.DWORD),
2129 ('OffsetHigh', ctypes.wintypes.DWORD),
2130 ('hEvent', ctypes.wintypes.HANDLE),
2131 ]
2132
2133 kernel32 = ctypes.windll.kernel32
2134 LockFileEx = kernel32.LockFileEx
2135 LockFileEx.argtypes = [
2136 ctypes.wintypes.HANDLE, # hFile
2137 ctypes.wintypes.DWORD, # dwFlags
2138 ctypes.wintypes.DWORD, # dwReserved
2139 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2140 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2141 ctypes.POINTER(OVERLAPPED) # Overlapped
2142 ]
2143 LockFileEx.restype = ctypes.wintypes.BOOL
2144 UnlockFileEx = kernel32.UnlockFileEx
2145 UnlockFileEx.argtypes = [
2146 ctypes.wintypes.HANDLE, # hFile
2147 ctypes.wintypes.DWORD, # dwReserved
2148 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2149 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2150 ctypes.POINTER(OVERLAPPED) # Overlapped
2151 ]
2152 UnlockFileEx.restype = ctypes.wintypes.BOOL
2153 whole_low = 0xffffffff
2154 whole_high = 0x7fffffff
2155
2156 def _lock_file(f, exclusive, block):
2157 overlapped = OVERLAPPED()
2158 overlapped.Offset = 0
2159 overlapped.OffsetHigh = 0
2160 overlapped.hEvent = 0
2161 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2162
2163 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2164 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2165 0, whole_low, whole_high, f._lock_file_overlapped_p):
2166 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2167
2168 def _unlock_file(f):
2169 assert f._lock_file_overlapped_p
2170 handle = msvcrt.get_osfhandle(f.fileno())
2171 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2172 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2173
2174 else:
2175 try:
2176 import fcntl
2177
2178 def _lock_file(f, exclusive, block):
2179 try:
2180 fcntl.flock(f,
2181 fcntl.LOCK_SH if not exclusive
2182 else fcntl.LOCK_EX if block
2183 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2184 except BlockingIOError:
2185 raise
2186 except OSError: # AOSP does not have flock()
2187 fcntl.lockf(f,
2188 fcntl.LOCK_SH if not exclusive
2189 else fcntl.LOCK_EX if block
2190 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2191
2192 def _unlock_file(f):
2193 try:
2194 fcntl.flock(f, fcntl.LOCK_UN)
2195 except OSError:
2196 fcntl.lockf(f, fcntl.LOCK_UN)
2197
2198 except ImportError:
2199 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2200
2201 def _lock_file(f, exclusive, block):
2202 raise IOError(UNSUPPORTED_MSG)
2203
2204 def _unlock_file(f):
2205 raise IOError(UNSUPPORTED_MSG)
2206
2207
2208 class locked_file(object):
2209 _closed = False
2210
2211 def __init__(self, filename, mode, block=True, encoding=None):
2212 assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2213 self.f = io.open(filename, mode, encoding=encoding)
2214 self.mode = mode
2215 self.block = block
2216
2217 def __enter__(self):
2218 exclusive = 'r' not in self.mode
2219 try:
2220 _lock_file(self.f, exclusive, self.block)
2221 except IOError:
2222 self.f.close()
2223 raise
2224 return self
2225
2226 def __exit__(self, etype, value, traceback):
2227 try:
2228 if not self._closed:
2229 _unlock_file(self.f)
2230 finally:
2231 self.f.close()
2232 self._closed = True
2233
2234 def __iter__(self):
2235 return iter(self.f)
2236
2237 def write(self, *args):
2238 return self.f.write(*args)
2239
2240 def read(self, *args):
2241 return self.f.read(*args)
2242
2243 def flush(self):
2244 self.f.flush()
2245
2246 def open(self):
2247 return self.__enter__()
2248
2249 def close(self, *args):
2250 self.__exit__(None, None, None)  # __exit__ ignores its arguments; unlock, then close
2251
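# Illustrative usage (a sketch with a hypothetical file name, not part of the
# module). With block=False, acquiring a lock held elsewhere raises instead of
# waiting:
#   with locked_file('progress.json', 'r', block=False) as f:
#       data = f.read()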
2252
2253 def get_filesystem_encoding():
2254 encoding = sys.getfilesystemencoding()
2255 return encoding if encoding is not None else 'utf-8'
2256
2257
2258 def shell_quote(args):
2259 quoted_args = []
2260 encoding = get_filesystem_encoding()
2261 for a in args:
2262 if isinstance(a, bytes):
2263 # We may get a filename encoded with 'encodeFilename'
2264 a = a.decode(encoding)
2265 quoted_args.append(compat_shlex_quote(a))
2266 return ' '.join(quoted_args)
2267
2268
2269 def smuggle_url(url, data):
2270 """ Pass additional data in a URL for internal use. """
2271
2272 url, idata = unsmuggle_url(url, {})
2273 data.update(idata)
2274 sdata = compat_urllib_parse_urlencode(
2275 {'__youtubedl_smuggle': json.dumps(data)})
2276 return url + '#' + sdata
2277
2278
2279 def unsmuggle_url(smug_url, default=None):
2280 if '#__youtubedl_smuggle' not in smug_url:
2281 return smug_url, default
2282 url, _, sdata = smug_url.rpartition('#')
2283 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2284 data = json.loads(jsond)
2285 return url, data
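# Illustrative round trip (a sketch, not part of the module):
#   url = smuggle_url('http://example.com/video', {'referrer': 'http://a.example'})
#   unsmuggle_url(url)
#   # -> ('http://example.com/video', {'referrer': 'http://a.example'})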
2286
2287
2288 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2289 """ Formats numbers with decimal sufixes like K, M, etc """
2290 num, factor = float_or_none(num), float(factor)
2291 if num is None or num < 0:
2292 return None
2293 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2294 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2295 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2296 if factor == 1024:
2297 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2298 converted = num / (factor ** exponent)
2299 return fmt % (converted, suffix)
2300
2301
2302 def format_bytes(bytes):
2303 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
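# Illustrative usage (a sketch, not part of the module):
#   format_decimal_suffix(1234, '%d%s')  # -> '1k'
#   format_bytes(1024 ** 2)              # -> '1.00MiB'
#   format_bytes(None)                   # -> 'N/A'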
2304
2305
2306 def lookup_unit_table(unit_table, s):
2307 units_re = '|'.join(re.escape(u) for u in unit_table)
2308 m = re.match(
2309 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2310 if not m:
2311 return None
2312 num_str = m.group('num').replace(',', '.')
2313 mult = unit_table[m.group('unit')]
2314 return int(float(num_str) * mult)
2315
2316
2317 def parse_filesize(s):
2318 if s is None:
2319 return None
2320
2321 # The lower-case forms are of course incorrect and unofficial,
2322 # but we support those too
2323 _UNIT_TABLE = {
2324 'B': 1,
2325 'b': 1,
2326 'bytes': 1,
2327 'KiB': 1024,
2328 'KB': 1000,
2329 'kB': 1024,
2330 'Kb': 1000,
2331 'kb': 1000,
2332 'kilobytes': 1000,
2333 'kibibytes': 1024,
2334 'MiB': 1024 ** 2,
2335 'MB': 1000 ** 2,
2336 'mB': 1024 ** 2,
2337 'Mb': 1000 ** 2,
2338 'mb': 1000 ** 2,
2339 'megabytes': 1000 ** 2,
2340 'mebibytes': 1024 ** 2,
2341 'GiB': 1024 ** 3,
2342 'GB': 1000 ** 3,
2343 'gB': 1024 ** 3,
2344 'Gb': 1000 ** 3,
2345 'gb': 1000 ** 3,
2346 'gigabytes': 1000 ** 3,
2347 'gibibytes': 1024 ** 3,
2348 'TiB': 1024 ** 4,
2349 'TB': 1000 ** 4,
2350 'tB': 1024 ** 4,
2351 'Tb': 1000 ** 4,
2352 'tb': 1000 ** 4,
2353 'terabytes': 1000 ** 4,
2354 'tebibytes': 1024 ** 4,
2355 'PiB': 1024 ** 5,
2356 'PB': 1000 ** 5,
2357 'pB': 1024 ** 5,
2358 'Pb': 1000 ** 5,
2359 'pb': 1000 ** 5,
2360 'petabytes': 1000 ** 5,
2361 'pebibytes': 1024 ** 5,
2362 'EiB': 1024 ** 6,
2363 'EB': 1000 ** 6,
2364 'eB': 1024 ** 6,
2365 'Eb': 1000 ** 6,
2366 'eb': 1000 ** 6,
2367 'exabytes': 1000 ** 6,
2368 'exbibytes': 1024 ** 6,
2369 'ZiB': 1024 ** 7,
2370 'ZB': 1000 ** 7,
2371 'zB': 1024 ** 7,
2372 'Zb': 1000 ** 7,
2373 'zb': 1000 ** 7,
2374 'zettabytes': 1000 ** 7,
2375 'zebibytes': 1024 ** 7,
2376 'YiB': 1024 ** 8,
2377 'YB': 1000 ** 8,
2378 'yB': 1024 ** 8,
2379 'Yb': 1000 ** 8,
2380 'yb': 1000 ** 8,
2381 'yottabytes': 1000 ** 8,
2382 'yobibytes': 1024 ** 8,
2383 }
2384
2385 return lookup_unit_table(_UNIT_TABLE, s)
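# Illustrative usage (a sketch, not part of the module; note the deliberate
# difference between the decimal "GB"/"Tb" and binary "MiB" multipliers):
#   parse_filesize('2 MiB')  # -> 2097152
#   parse_filesize('5 GB')   # -> 5000000000
#   parse_filesize('1.2Tb')  # -> 1200000000000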
2386
2387
2388 def parse_count(s):
2389 if s is None:
2390 return None
2391
2392 s = re.sub(r'^[^\d]+\s', '', s).strip()
2393
2394 if re.match(r'^[\d,.]+$', s):
2395 return str_to_int(s)
2396
2397 _UNIT_TABLE = {
2398 'k': 1000,
2399 'K': 1000,
2400 'm': 1000 ** 2,
2401 'M': 1000 ** 2,
2402 'kk': 1000 ** 2,
2403 'KK': 1000 ** 2,
2404 'b': 1000 ** 3,
2405 'B': 1000 ** 3,
2406 }
2407
2408 ret = lookup_unit_table(_UNIT_TABLE, s)
2409 if ret is not None:
2410 return ret
2411
2412 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2413 if mobj:
2414 return str_to_int(mobj.group(1))
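# Illustrative usage (a sketch, not part of the module):
#   parse_count('1,234')       # -> 1234
#   parse_count('1.1M')        # -> 1100000
#   parse_count('1.1M views')  # -> 1100000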
2415
2416
2417 def parse_resolution(s):
2418 if s is None:
2419 return {}
2420
2421 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2422 if mobj:
2423 return {
2424 'width': int(mobj.group('w')),
2425 'height': int(mobj.group('h')),
2426 }
2427
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2429 if mobj:
2430 return {'height': int(mobj.group(1))}
2431
2432 mobj = re.search(r'\b([48])[kK]\b', s)
2433 if mobj:
2434 return {'height': int(mobj.group(1)) * 540}
2435
2436 return {}
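# Illustrative usage (a sketch, not part of the module):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}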
2437
2438
2439 def parse_bitrate(s):
2440 if not isinstance(s, compat_str):
2441 return
2442 mobj = re.search(r'\b(\d+)\s*kbps', s)
2443 if mobj:
2444 return int(mobj.group(1))
2445
2446
2447 def month_by_name(name, lang='en'):
2448 """ Return the number of a month by (locale-independently) English name """
2449
2450 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2451
2452 try:
2453 return month_names.index(name) + 1
2454 except ValueError:
2455 return None
2456
2457
2458 def month_by_abbreviation(abbrev):
2459 """ Return the number of a month by (locale-independently) English
2460 abbreviations """
2461
2462 try:
2463 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2464 except ValueError:
2465 return None
2466
2467
2468 def fix_xml_ampersands(xml_str):
2469 """Replace all the '&' by '&amp;' in XML"""
2470 return re.sub(
2471 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2472 '&amp;',
2473 xml_str)
2474
2475
2476 def setproctitle(title):
2477 assert isinstance(title, compat_str)
2478
2479 # ctypes in Jython is not complete
2480 # http://bugs.jython.org/issue2148
2481 if sys.platform.startswith('java'):
2482 return
2483
2484 try:
2485 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2486 except OSError:
2487 return
2488 except TypeError:
2489 # LoadLibrary in Windows Python 2.7.13 only expects
2490 # a bytestring, but since unicode_literals turns
2491 # every string into a unicode string, it fails.
2492 return
2493 title_bytes = title.encode('utf-8')
2494 buf = ctypes.create_string_buffer(len(title_bytes))
2495 buf.value = title_bytes
2496 try:
2497 libc.prctl(15, buf, 0, 0, 0)
2498 except AttributeError:
2499 return # Strange libc, just skip this
2500
2501
2502 def remove_start(s, start):
2503 return s[len(start):] if s is not None and s.startswith(start) else s
2504
2505
2506 def remove_end(s, end):
2507 return s[:-len(end)] if s is not None and s.endswith(end) else s
2508
2509
2510 def remove_quotes(s):
2511 if s is None or len(s) < 2:
2512 return s
2513 for quote in ('"', "'", ):
2514 if s[0] == quote and s[-1] == quote:
2515 return s[1:-1]
2516 return s
2517
2518
2519 def get_domain(url):
2520 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2521 return domain.group('domain') if domain else None
2522
2523
2524 def url_basename(url):
2525 path = compat_urlparse.urlparse(url).path
2526 return path.strip('/').split('/')[-1]
2527
2528
2529 def base_url(url):
2530 return re.match(r'https?://[^?#&]+/', url).group()
2531
2532
2533 def urljoin(base, path):
2534 if isinstance(path, bytes):
2535 path = path.decode('utf-8')
2536 if not isinstance(path, compat_str) or not path:
2537 return None
2538 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2539 return path
2540 if isinstance(base, bytes):
2541 base = base.decode('utf-8')
2542 if not isinstance(base, compat_str) or not re.match(
2543 r'^(?:https?:)?//', base):
2544 return None
2545 return compat_urlparse.urljoin(base, path)
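# Illustrative usage (a sketch, not part of the module):
#   urljoin('http://foo.de/', '/a/b/c.txt')  # -> 'http://foo.de/a/b/c.txt'
#   urljoin('http://foo.de/a/b/', 'c.txt')   # -> 'http://foo.de/a/b/c.txt'
#   urljoin('ftp://foo.de/', '/a/b/c.txt')   # -> None (base must be http(s))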
2546
2547
2548 class HEADRequest(compat_urllib_request.Request):
2549 def get_method(self):
2550 return 'HEAD'
2551
2552
2553 class PUTRequest(compat_urllib_request.Request):
2554 def get_method(self):
2555 return 'PUT'
2556
2557
2558 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2559 if get_attr and v is not None:
2560 v = getattr(v, get_attr, None)
2561 try:
2562 return int(v) * invscale // scale
2563 except (ValueError, TypeError, OverflowError):
2564 return default
2565
2566
2567 def str_or_none(v, default=None):
2568 return default if v is None else compat_str(v)
2569
2570
2571 def str_to_int(int_str):
2572 """ A more relaxed version of int_or_none """
2573 if isinstance(int_str, compat_integer_types):
2574 return int_str
2575 elif isinstance(int_str, compat_str):
2576 int_str = re.sub(r'[,\.\+]', '', int_str)
2577 return int_or_none(int_str)
2578
2579
2580 def float_or_none(v, scale=1, invscale=1, default=None):
2581 if v is None:
2582 return default
2583 try:
2584 return float(v) * invscale / scale
2585 except (ValueError, TypeError):
2586 return default
2587
2588
2589 def bool_or_none(v, default=None):
2590 return v if isinstance(v, bool) else default
2591
2592
2593 def strip_or_none(v, default=None):
2594 return v.strip() if isinstance(v, compat_str) else default
2595
2596
2597 def url_or_none(url):
2598 if not url or not isinstance(url, compat_str):
2599 return None
2600 url = url.strip()
2601 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2602
2603
2604 def request_to_url(req):
2605 if isinstance(req, compat_urllib_request.Request):
2606 return req.get_full_url()
2607 else:
2608 return req
2609
2610
2611 def strftime_or_none(timestamp, date_format, default=None):
2612 datetime_object = None
2613 try:
2614 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2615 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2616 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2617 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2618 return datetime_object.strftime(date_format)
2619 except (ValueError, TypeError, AttributeError):
2620 return default
2621
2622
2623 def parse_duration(s):
2624 if not isinstance(s, compat_basestring):
2625 return None
2626 s = s.strip()
2627 if not s:
2628 return None
2629
2630 days, hours, mins, secs, ms = [None] * 5
2631 m = re.match(r'''(?x)
2632 (?P<before_secs>
2633 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2634 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2635 (?P<ms>[.:][0-9]+)?Z?$
2636 ''', s)
2637 if m:
2638 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2639 else:
2640 m = re.match(
2641 r'''(?ix)(?:P?
2642 (?:
2643 [0-9]+\s*y(?:ears?)?\s*
2644 )?
2645 (?:
2646 [0-9]+\s*m(?:onths?)?\s*
2647 )?
2648 (?:
2649 [0-9]+\s*w(?:eeks?)?\s*
2650 )?
2651 (?:
2652 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2653 )?
2654 T)?
2655 (?:
2656 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2657 )?
2658 (?:
2659 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2660 )?
2661 (?:
2662 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2663 )?Z?$''', s)
2664 if m:
2665 days, hours, mins, secs, ms = m.groups()
2666 else:
2667 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2668 if m:
2669 hours, mins = m.groups()
2670 else:
2671 return None
2672
2673 duration = 0
2674 if secs:
2675 duration += float(secs)
2676 if mins:
2677 duration += float(mins) * 60
2678 if hours:
2679 duration += float(hours) * 60 * 60
2680 if days:
2681 duration += float(days) * 24 * 60 * 60
2682 if ms:
2683 duration += float(ms.replace(':', '.'))
2684 return duration
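# Illustrative usage (a sketch, not part of the module):
#   parse_duration('9:12:43')     # -> 33163
#   parse_duration('3 min')       # -> 180
#   parse_duration('PT1H0.040S')  # -> 3600.04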
2685
2686
2687 def prepend_extension(filename, ext, expected_real_ext=None):
2688 name, real_ext = os.path.splitext(filename)
2689 return (
2690 '{0}.{1}{2}'.format(name, ext, real_ext)
2691 if not expected_real_ext or real_ext[1:] == expected_real_ext
2692 else '{0}.{1}'.format(filename, ext))
2693
2694
2695 def replace_extension(filename, ext, expected_real_ext=None):
2696 name, real_ext = os.path.splitext(filename)
2697 return '{0}.{1}'.format(
2698 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2699 ext)
2700
2701
2702 def check_executable(exe, args=[]):
2703 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2704 args can be a list of arguments for a short output (like -version) """
2705 try:
2706 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2707 except OSError:
2708 return False
2709 return exe
2710
2711
2712 def _get_exe_version_output(exe, args):
2713 try:
2714 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2715 # SIGTTOU if yt-dlp is run in the background.
2716 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2717 out, _ = Popen(
2718 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2719 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2720 except OSError:
2721 return False
2722 if isinstance(out, bytes): # Python 2.x
2723 out = out.decode('ascii', 'ignore')
2724 return out
2725
2726
2727 def detect_exe_version(output, version_re=None, unrecognized='present'):
2728 assert isinstance(output, compat_str)
2729 if version_re is None:
2730 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2731 m = re.search(version_re, output)
2732 if m:
2733 return m.group(1)
2734 else:
2735 return unrecognized
2736
2737
2738 def get_exe_version(exe, args=['--version'],
2739 version_re=None, unrecognized='present'):
2740 """ Returns the version of the specified executable,
2741 or False if the executable is not present """
2742 out = _get_exe_version_output(exe, args)
2743 return detect_exe_version(out, version_re, unrecognized) if out else False
2744
2745
2746 class LazyList(collections.abc.Sequence):
2747 ''' Lazy immutable list from an iterable
2748 Note that slices of a LazyList are lists and not LazyList'''
2749
2750 class IndexError(IndexError):
2751 pass
2752
2753 def __init__(self, iterable, *, reverse=False, _cache=None):
2754 self.__iterable = iter(iterable)
2755 self.__cache = [] if _cache is None else _cache
2756 self.__reversed = reverse
2757
2758 def __iter__(self):
2759 if self.__reversed:
2760 # We need to consume the entire iterable to iterate in reverse
2761 yield from self.exhaust()
2762 return
2763 yield from self.__cache
2764 for item in self.__iterable:
2765 self.__cache.append(item)
2766 yield item
2767
2768 def __exhaust(self):
2769 self.__cache.extend(self.__iterable)
2770 # Discard the emptied iterable to make it pickle-able
2771 self.__iterable = []
2772 return self.__cache
2773
2774 def exhaust(self):
2775 ''' Evaluate the entire iterable '''
2776 return self.__exhaust()[::-1 if self.__reversed else 1]
2777
2778 @staticmethod
2779 def __reverse_index(x):
2780 return None if x is None else -(x + 1)
2781
2782 def __getitem__(self, idx):
2783 if isinstance(idx, slice):
2784 if self.__reversed:
2785 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2786 start, stop, step = idx.start, idx.stop, idx.step or 1
2787 elif isinstance(idx, int):
2788 if self.__reversed:
2789 idx = self.__reverse_index(idx)
2790 start, stop, step = idx, idx, 0
2791 else:
2792 raise TypeError('indices must be integers or slices')
2793 if ((start or 0) < 0 or (stop or 0) < 0
2794 or (start is None and step < 0)
2795 or (stop is None and step > 0)):
2796 # We need to consume the entire iterable to be able to slice from the end
2797 # Obviously, never use this with infinite iterables
2798 self.__exhaust()
2799 try:
2800 return self.__cache[idx]
2801 except IndexError as e:
2802 raise self.IndexError(e) from e
2803 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2804 if n > 0:
2805 self.__cache.extend(itertools.islice(self.__iterable, n))
2806 try:
2807 return self.__cache[idx]
2808 except IndexError as e:
2809 raise self.IndexError(e) from e
2810
2811 def __bool__(self):
2812 try:
2813 self[-1] if self.__reversed else self[0]
2814 except self.IndexError:
2815 return False
2816 return True
2817
2818 def __len__(self):
2819 self.__exhaust()
2820 return len(self.__cache)
2821
2822 def __reversed__(self):
2823 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2824
2825 def __copy__(self):
2826 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2827
2828 def __repr__(self):
2829 # repr and str should mimic a list. So we exhaust the iterable
2830 return repr(self.exhaust())
2831
2832 def __str__(self):
2833 return repr(self.exhaust())
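# Illustrative usage (a sketch, not part of the module). Items are pulled from
# the underlying iterable only as far as indexing requires, and slices are
# plain lists:
#   lazy = LazyList(itertools.count())
#   lazy[10]  # -> 10 (consumes only the first 11 items)
#   lazy[:3]  # -> [0, 1, 2] (served from the cache)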
2834
2835
2836 class PagedList:
2837
2838 class IndexError(IndexError):
2839 pass
2840
2841 def __len__(self):
2842 # This is only useful for tests
2843 return len(self.getslice())
2844
2845 def __init__(self, pagefunc, pagesize, use_cache=True):
2846 self._pagefunc = pagefunc
2847 self._pagesize = pagesize
2848 self._pagecount = float('inf')
2849 self._use_cache = use_cache
2850 self._cache = {}
2851
2852 def getpage(self, pagenum):
2853 page_results = self._cache.get(pagenum)
2854 if page_results is None:
2855 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2856 if self._use_cache:
2857 self._cache[pagenum] = page_results
2858 return page_results
2859
2860 def getslice(self, start=0, end=None):
2861 return list(self._getslice(start, end))
2862
2863 def _getslice(self, start, end):
2864 raise NotImplementedError('This method must be implemented by subclasses')
2865
2866 def __getitem__(self, idx):
2867 assert self._use_cache, 'Indexing PagedList requires cache'
2868 if not isinstance(idx, int) or idx < 0:
2869 raise TypeError('indices must be non-negative integers')
2870 entries = self.getslice(idx, idx + 1)
2871 if not entries:
2872 raise self.IndexError()
2873 return entries[0]
2874
2875
2876 class OnDemandPagedList(PagedList):
2877 def _getslice(self, start, end):
2878 for pagenum in itertools.count(start // self._pagesize):
2879 firstid = pagenum * self._pagesize
2880 nextfirstid = pagenum * self._pagesize + self._pagesize
2881 if start >= nextfirstid:
2882 continue
2883
2884 startv = (
2885 start % self._pagesize
2886 if firstid <= start < nextfirstid
2887 else 0)
2888 endv = (
2889 ((end - 1) % self._pagesize) + 1
2890 if (end is not None and firstid <= end <= nextfirstid)
2891 else None)
2892
2893 try:
2894 page_results = self.getpage(pagenum)
2895 except Exception:
2896 self._pagecount = pagenum - 1
2897 raise
2898 if startv != 0 or endv is not None:
2899 page_results = page_results[startv:endv]
2900 yield from page_results
2901
2902 # A little optimization - if the current page is not "full", i.e. does
2903 # not contain page_size videos, then we can assume that this page
2904 # is the last one - there are no more ids on further pages -
2905 # so there is no need to query again.
2906 if len(page_results) + startv < self._pagesize:
2907 break
2908
2909 # If we got the whole page, but the next page is not interesting,
2910 # break out early as well
2911 if end == nextfirstid:
2912 break
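# Illustrative usage (a sketch, not part of the module) with a hypothetical
# page function that serves three items per page:
#   pages = OnDemandPagedList(lambda n: list(range(n * 3, n * 3 + 3)), 3)
#   pages.getslice(2, 5)  # -> [2, 3, 4], fetching only pages 0 and 1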
2913
2914
2915 class InAdvancePagedList(PagedList):
2916 def __init__(self, pagefunc, pagecount, pagesize):
2917 PagedList.__init__(self, pagefunc, pagesize, True)
2918 self._pagecount = pagecount
2919
2920 def _getslice(self, start, end):
2921 start_page = start // self._pagesize
2922 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2923 skip_elems = start - start_page * self._pagesize
2924 only_more = None if end is None else end - start
2925 for pagenum in range(start_page, end_page):
2926 page_results = self.getpage(pagenum)
2927 if skip_elems:
2928 page_results = page_results[skip_elems:]
2929 skip_elems = None
2930 if only_more is not None:
2931 if len(page_results) < only_more:
2932 only_more -= len(page_results)
2933 else:
2934 yield from page_results[:only_more]
2935 break
2936 yield from page_results
2937
2938
2939 def uppercase_escape(s):
2940 unicode_escape = codecs.getdecoder('unicode_escape')
2941 return re.sub(
2942 r'\\U[0-9a-fA-F]{8}',
2943 lambda m: unicode_escape(m.group(0))[0],
2944 s)
2945
2946
2947 def lowercase_escape(s):
2948 unicode_escape = codecs.getdecoder('unicode_escape')
2949 return re.sub(
2950 r'\\u[0-9a-fA-F]{4}',
2951 lambda m: unicode_escape(m.group(0))[0],
2952 s)
2953
2954
2955 def escape_rfc3986(s):
2956 """Escape non-ASCII characters as suggested by RFC 3986"""
2957 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2958 s = s.encode('utf-8')
2959 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2960
2961
2962 def escape_url(url):
2963 """Escape URL as suggested by RFC 3986"""
2964 url_parsed = compat_urllib_parse_urlparse(url)
2965 return url_parsed._replace(
2966 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2967 path=escape_rfc3986(url_parsed.path),
2968 params=escape_rfc3986(url_parsed.params),
2969 query=escape_rfc3986(url_parsed.query),
2970 fragment=escape_rfc3986(url_parsed.fragment)
2971 ).geturl()
2972
2973
2974 def parse_qs(url):
2975 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2976
2977
2978 def read_batch_urls(batch_fd):
2979 def fixup(url):
2980 if not isinstance(url, compat_str):
2981 url = url.decode('utf-8', 'replace')
2982 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2983 for bom in BOM_UTF8:
2984 if url.startswith(bom):
2985 url = url[len(bom):]
2986 url = url.lstrip()
2987 if not url or url.startswith(('#', ';', ']')):
2988 return False
2989 # "#" cannot be stripped out since it is part of the URI
2990 # However, it can be safely stripped out if following a whitespace
2991 return re.split(r'\s#', url, 1)[0].rstrip()
2992
2993 with contextlib.closing(batch_fd) as fd:
2994 return [url for url in map(fixup, fd) if url]
2995
2996
2997 def urlencode_postdata(*args, **kargs):
2998 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2999
3000
3001 def update_url_query(url, query):
3002 if not query:
3003 return url
3004 parsed_url = compat_urlparse.urlparse(url)
3005 qs = compat_parse_qs(parsed_url.query)
3006 qs.update(query)
3007 return compat_urlparse.urlunparse(parsed_url._replace(
3008 query=compat_urllib_parse_urlencode(qs, True)))
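# Illustrative usage (a sketch, not part of the module; parameter order is
# preserved on Python 3.7+, where dicts keep insertion order):
#   update_url_query('http://example.com/path', {'quality': 'HD', 'format': 'mp4'})
#   # -> 'http://example.com/path?quality=HD&format=mp4'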
3009
3010
3011 def update_Request(req, url=None, data=None, headers={}, query={}):
3012 req_headers = req.headers.copy()
3013 req_headers.update(headers)
3014 req_data = data or req.data
3015 req_url = update_url_query(url or req.get_full_url(), query)
3016 req_get_method = req.get_method()
3017 if req_get_method == 'HEAD':
3018 req_type = HEADRequest
3019 elif req_get_method == 'PUT':
3020 req_type = PUTRequest
3021 else:
3022 req_type = compat_urllib_request.Request
3023 new_req = req_type(
3024 req_url, data=req_data, headers=req_headers,
3025 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3026 if hasattr(req, 'timeout'):
3027 new_req.timeout = req.timeout
3028 return new_req
3029
3030
3031 def _multipart_encode_impl(data, boundary):
3032 content_type = 'multipart/form-data; boundary=%s' % boundary
3033
3034 out = b''
3035 for k, v in data.items():
3036 out += b'--' + boundary.encode('ascii') + b'\r\n'
3037 if isinstance(k, compat_str):
3038 k = k.encode('utf-8')
3039 if isinstance(v, compat_str):
3040 v = v.encode('utf-8')
3041 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3042 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3043 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3044 if boundary.encode('ascii') in content:
3045 raise ValueError('Boundary overlaps with data')
3046 out += content
3047
3048 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3049
3050 return out, content_type
3051
3052
3053 def multipart_encode(data, boundary=None):
3054 '''
3055 Encode a dict to RFC 7578-compliant form-data
3056
3057 data:
3058 A dict where keys and values can be either Unicode or bytes-like
3059 objects.
3060 boundary:
3061 If specified a Unicode object, it's used as the boundary. Otherwise
3062 a random boundary is generated.
3063
3064 Reference: https://tools.ietf.org/html/rfc7578
3065 '''
3066 has_specified_boundary = boundary is not None
3067
3068 while True:
3069 if boundary is None:
3070 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3071
3072 try:
3073 out, content_type = _multipart_encode_impl(data, boundary)
3074 break
3075 except ValueError:
3076 if has_specified_boundary:
3077 raise
3078 boundary = None
3079
3080 return out, content_type
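# Illustrative usage (a sketch, not part of the module):
#   multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0]
#   # -> b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n'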
3081
3082
3083 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3084 if isinstance(key_or_keys, (list, tuple)):
3085 for key in key_or_keys:
3086 if key not in d or d[key] is None or skip_false_values and not d[key]:
3087 continue
3088 return d[key]
3089 return default
3090 return d.get(key_or_keys, default)
3091
3092
3093 def try_get(src, getter, expected_type=None):
3094 for get in variadic(getter):
3095 try:
3096 v = get(src)
3097 except (AttributeError, KeyError, TypeError, IndexError):
3098 pass
3099 else:
3100 if expected_type is None or isinstance(v, expected_type):
3101 return v
3102
3103
3104 def merge_dicts(*dicts):
3105 merged = {}
3106 for a_dict in dicts:
3107 for k, v in a_dict.items():
3108 if v is None:
3109 continue
3110 if (k not in merged
3111 or (isinstance(v, compat_str) and v
3112 and isinstance(merged[k], compat_str)
3113 and not merged[k])):
3114 merged[k] = v
3115 return merged
3116
3117
3118 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3119 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3120
3121
3122 US_RATINGS = {
3123 'G': 0,
3124 'PG': 10,
3125 'PG-13': 13,
3126 'R': 16,
3127 'NC': 18,
3128 }
3129
3130
3131 TV_PARENTAL_GUIDELINES = {
3132 'TV-Y': 0,
3133 'TV-Y7': 7,
3134 'TV-G': 0,
3135 'TV-PG': 0,
3136 'TV-14': 14,
3137 'TV-MA': 17,
3138 }
3139
3140
3141 def parse_age_limit(s):
3142 if type(s) == int:
3143 return s if 0 <= s <= 21 else None
3144 if not isinstance(s, compat_basestring):
3145 return None
3146 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3147 if m:
3148 return int(m.group('age'))
3149 s = s.upper()
3150 if s in US_RATINGS:
3151 return US_RATINGS[s]
3152 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3153 if m:
3154 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3155 return None
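# Illustrative usage (a sketch, not part of the module):
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17
#   parse_age_limit('18+')    # -> 18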
3156
3157
3158 def strip_jsonp(code):
3159 return re.sub(
3160 r'''(?sx)^
3161 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3162 (?:\s*&&\s*(?P=func_name))?
3163 \s*\(\s*(?P<callback_data>.*)\);?
3164 \s*?(?://[^\n]*)*$''',
3165 r'\g<callback_data>', code)
3166
3167
3168 def js_to_json(code, vars={}):
3169 # vars is a dict of var, val pairs to substitute
3170 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3171 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3172 INTEGER_TABLE = (
3173 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3174 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3175 )
3176
3177 def fix_kv(m):
3178 v = m.group(0)
3179 if v in ('true', 'false', 'null'):
3180 return v
3181 elif v in ('undefined', 'void 0'):
3182 return 'null'
3183 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3184 return ""
3185
3186 if v[0] in ("'", '"'):
3187 v = re.sub(r'(?s)\\.|"', lambda m: {
3188 '"': '\\"',
3189 "\\'": "'",
3190 '\\\n': '',
3191 '\\x': '\\u00',
3192 }.get(m.group(0), m.group(0)), v[1:-1])
3193 else:
3194 for regex, base in INTEGER_TABLE:
3195 im = re.match(regex, v)
3196 if im:
3197 i = int(im.group(1), base)
3198 return '"%d":' % i if v.endswith(':') else '%d' % i
3199
3200 if v in vars:
3201 return vars[v]
3202
3203 return '"%s"' % v
3204
3205 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)  # unwrap new Date("...") to its string argument
3206
3207 return re.sub(r'''(?sx)
3208 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3209 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3210 {comment}|,(?={skip}[\]}}])|
3211 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3212 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3213 [0-9]+(?={skip}:)|
3214 !+
3215 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
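# Illustrative usage (a sketch, not part of the module) - bare keys are quoted,
# single quotes become double quotes and trailing commas are dropped:
#   js_to_json("{abc: true, 'def': 'ghi'}")  # -> '{"abc": true, "def": "ghi"}'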
3216
3217
3218 def qualities(quality_ids):
3219 """ Get a numeric quality value out of a list of possible values """
3220 def q(qid):
3221 try:
3222 return quality_ids.index(qid)
3223 except ValueError:
3224 return -1
3225 return q
3226
3227
3228 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3229
3230
3231 DEFAULT_OUTTMPL = {
3232 'default': '%(title)s [%(id)s].%(ext)s',
3233 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3234 }
3235 OUTTMPL_TYPES = {
3236 'chapter': None,
3237 'subtitle': None,
3238 'thumbnail': None,
3239 'description': 'description',
3240 'annotation': 'annotations.xml',
3241 'infojson': 'info.json',
3242 'link': None,
3243 'pl_video': None,
3244 'pl_thumbnail': None,
3245 'pl_description': 'description',
3246 'pl_infojson': 'info.json',
3247 }
3248
3249 # As of [1] format syntax is:
3250 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3251 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3252 STR_FORMAT_RE_TMPL = r'''(?x)
3253 (?<!%)(?P<prefix>(?:%%)*)
3254 %
3255 (?P<has_key>\((?P<key>{0})\))?
3256 (?P<format>
3257 (?P<conversion>[#0\-+ ]+)?
3258 (?P<min_width>\d+)?
3259 (?P<precision>\.\d+)?
3260 (?P<len_mod>[hlL])? # unused in python
3261 {1} # conversion type
3262 )
3263 '''
3264
3265
3266 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3267
3268
3269 def limit_length(s, length):
3270 """ Add ellipses to overly long strings """
3271 if s is None:
3272 return None
3273 ELLIPSES = '...'
3274 if len(s) > length:
3275 return s[:length - len(ELLIPSES)] + ELLIPSES
3276 return s
3277
3278
3279 def version_tuple(v):
3280 return tuple(int(e) for e in re.split(r'[-.]', v))
3281
3282
3283 def is_outdated_version(version, limit, assume_new=True):
3284 if not version:
3285 return not assume_new
3286 try:
3287 return version_tuple(version) < version_tuple(limit)
3288 except ValueError:
3289 return not assume_new
3290
3291
3292 def ytdl_is_updateable():
3293 """ Returns if yt-dlp can be updated with -U """
3294
3295 from .update import is_non_updateable
3296
3297 return not is_non_updateable()
3298
3299
3300 def args_to_str(args):
3301 # Get a short string representation for a subprocess command
3302 return ' '.join(compat_shlex_quote(a) for a in args)
3303
3304
3305 def error_to_compat_str(err):
3306 err_str = str(err)
3307 # On python 2 error byte string must be decoded with proper
3308 # encoding rather than ascii
3309 if sys.version_info[0] < 3:
3310 err_str = err_str.decode(preferredencoding())
3311 return err_str
3312
3313
3314 def mimetype2ext(mt):
3315 if mt is None:
3316 return None
3317
3318 mt, _, params = mt.partition(';')
3319 mt = mt.strip()
3320
3321 FULL_MAP = {
3322 'audio/mp4': 'm4a',
3323 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3324 # it's the most popular one
3325 'audio/mpeg': 'mp3',
3326 'audio/x-wav': 'wav',
3327 'audio/wav': 'wav',
3328 'audio/wave': 'wav',
3329 }
3330
3331 ext = FULL_MAP.get(mt)
3332 if ext is not None:
3333 return ext
3334
3335 SUBTYPE_MAP = {
3336 '3gpp': '3gp',
3337 'smptett+xml': 'tt',
3338 'ttaf+xml': 'dfxp',
3339 'ttml+xml': 'ttml',
3340 'x-flv': 'flv',
3341 'x-mp4-fragmented': 'mp4',
3342 'x-ms-sami': 'sami',
3343 'x-ms-wmv': 'wmv',
3344 'mpegurl': 'm3u8',
3345 'x-mpegurl': 'm3u8',
3346 'vnd.apple.mpegurl': 'm3u8',
3347 'dash+xml': 'mpd',
3348 'f4m+xml': 'f4m',
3349 'hds+xml': 'f4m',
3350 'vnd.ms-sstr+xml': 'ism',
3351 'quicktime': 'mov',
3352 'mp2t': 'ts',
3353 'x-wav': 'wav',
3354 'filmstrip+json': 'fs',
3355 'svg+xml': 'svg',
3356 }
3357
3358 _, _, subtype = mt.rpartition('/')
3359 ext = SUBTYPE_MAP.get(subtype.lower())
3360 if ext is not None:
3361 return ext
3362
3363 SUFFIX_MAP = {
3364 'json': 'json',
3365 'xml': 'xml',
3366 'zip': 'zip',
3367 'gzip': 'gz',
3368 }
3369
3370 _, _, suffix = subtype.partition('+')
3371 ext = SUFFIX_MAP.get(suffix)
3372 if ext is not None:
3373 return ext
3374
3375 return subtype.replace('+', '.')
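# Illustrative usage (a sketch, not part of the module):
#   mimetype2ext('audio/mp4')              # -> 'm4a'
#   mimetype2ext('application/x-mpegURL')  # -> 'm3u8'
#   mimetype2ext('text/vtt')               # -> 'vtt'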
3376
3377
3378 def ext2mimetype(ext_or_url):
3379 if not ext_or_url:
3380 return None
3381 if '.' not in ext_or_url:
3382 ext_or_url = f'file.{ext_or_url}'
3383 return mimetypes.guess_type(ext_or_url)[0]
3384
3385
3386 def parse_codecs(codecs_str):
3387 # http://tools.ietf.org/html/rfc6381
3388 if not codecs_str:
3389 return {}
3390 split_codecs = list(filter(None, map(
3391 str.strip, codecs_str.strip().strip(',').split(','))))
3392 vcodec, acodec, tcodec, hdr = None, None, None, None
3393 for full_codec in split_codecs:
3394 parts = full_codec.split('.')
3395 codec = parts[0].replace('0', '')
3396 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3397 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3398 if not vcodec:
3399 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3400 if codec in ('dvh1', 'dvhe'):
3401 hdr = 'DV'
3402 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3403 hdr = 'HDR10'
3404 elif full_codec.replace('0', '').startswith('vp9.2'):
3405 hdr = 'HDR10'
3406 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3407 if not acodec:
3408 acodec = full_codec
3409 elif codec in ('stpp', 'wvtt',):
3410 if not tcodec:
3411 tcodec = full_codec
3412 else:
3413 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3414 if vcodec or acodec or tcodec:
3415 return {
3416 'vcodec': vcodec or 'none',
3417 'acodec': acodec or 'none',
3418 'dynamic_range': hdr,
3419 **({'tcodec': tcodec} if tcodec is not None else {}),
3420 }
3421 elif len(split_codecs) == 2:
3422 return {
3423 'vcodec': split_codecs[0],
3424 'acodec': split_codecs[1],
3425 }
3426 return {}
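# Illustrative usage (a sketch, not part of the module):
#   parse_codecs('avc1.77.30, mp4a.40.2')
#   # -> {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}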
3427
3428
3429 def urlhandle_detect_ext(url_handle):
3430 getheader = url_handle.headers.get
3431
3432 cd = getheader('Content-Disposition')
3433 if cd:
3434 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3435 if m:
3436 e = determine_ext(m.group('filename'), default_ext=None)
3437 if e:
3438 return e
3439
3440 return mimetype2ext(getheader('Content-Type'))
3441
3442
3443 def encode_data_uri(data, mime_type):
3444 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3445
3446
3447 def age_restricted(content_limit, age_limit):
3448 """ Returns True iff the content should be blocked """
3449
3450 if age_limit is None: # No limit set
3451 return False
3452 if content_limit is None:
3453 return False # Content available for everyone
3454 return age_limit < content_limit
3455
3456
3457 def is_html(first_bytes):
3458 """ Detect whether a file contains HTML by examining its first bytes. """
3459
3460 BOMS = [
3461 (b'\xef\xbb\xbf', 'utf-8'),
3462 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3463 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3464 (b'\xff\xfe', 'utf-16-le'),
3465 (b'\xfe\xff', 'utf-16-be'),
3466 ]
3467 for bom, enc in BOMS:
3468 if first_bytes.startswith(bom):
3469 s = first_bytes[len(bom):].decode(enc, 'replace')
3470 break
3471 else:
3472 s = first_bytes.decode('utf-8', 'replace')
3473
3474 return re.match(r'^\s*<', s)
3475
3476
3477 def determine_protocol(info_dict):
3478 protocol = info_dict.get('protocol')
3479 if protocol is not None:
3480 return protocol
3481
3482 url = sanitize_url(info_dict['url'])
3483 if url.startswith('rtmp'):
3484 return 'rtmp'
3485 elif url.startswith('mms'):
3486 return 'mms'
3487 elif url.startswith('rtsp'):
3488 return 'rtsp'
3489
3490 ext = determine_ext(url)
3491 if ext == 'm3u8':
3492 return 'm3u8'
3493 elif ext == 'f4m':
3494 return 'f4m'
3495
3496 return compat_urllib_parse_urlparse(url).scheme
3497
3498
3499 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3500 """ Render a list of rows, each as a list of values.
3501 Text after a \t will be right aligned """
3502 def width(string):
3503 return len(remove_terminal_sequences(string).replace('\t', ''))
3504
3505 def get_max_lens(table):
3506 return [max(width(str(v)) for v in col) for col in zip(*table)]
3507
3508 def filter_using_list(row, filterArray):
3509 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3510
3511 max_lens = get_max_lens(data) if hide_empty else []
3512 header_row = filter_using_list(header_row, max_lens)
3513 data = [filter_using_list(row, max_lens) for row in data]
3514
3515 table = [header_row] + data
3516 max_lens = get_max_lens(table)
3517 extra_gap += 1
3518 if delim:
3519 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3520 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3521 for row in table:
3522 for pos, text in enumerate(map(str, row)):
3523 if '\t' in text:
3524 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3525 else:
3526 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3527 ret = '\n'.join(''.join(row).rstrip() for row in table)
3528 return ret
3529
3530
3531 def _match_one(filter_part, dct, incomplete):
3532 # TODO: Generalize code with YoutubeDL._build_format_filter
3533 STRING_OPERATORS = {
3534 '*=': operator.contains,
3535 '^=': lambda attr, value: attr.startswith(value),
3536 '$=': lambda attr, value: attr.endswith(value),
3537 '~=': lambda attr, value: re.search(value, attr),
3538 }
3539 COMPARISON_OPERATORS = {
3540 **STRING_OPERATORS,
3541 '<=': operator.le, # "<=" must be defined above "<"
3542 '<': operator.lt,
3543 '>=': operator.ge,
3544 '>': operator.gt,
3545 '=': operator.eq,
3546 }
3547
3548 if isinstance(incomplete, bool):
3549 is_incomplete = lambda _: incomplete
3550 else:
3551 is_incomplete = lambda k: k in incomplete
3552
3553 operator_rex = re.compile(r'''(?x)\s*
3554 (?P<key>[a-z_]+)
3555 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3556 (?:
3557 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3558 (?P<strval>.+?)
3559 )
3560 \s*$
3561 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3562 m = operator_rex.search(filter_part)
3563 if m:
3564 m = m.groupdict()
3565 unnegated_op = COMPARISON_OPERATORS[m['op']]
3566 if m['negation']:
3567 op = lambda attr, value: not unnegated_op(attr, value)
3568 else:
3569 op = unnegated_op
3570 comparison_value = m['quotedstrval'] or m['strval']
3571 if m['quote']:
3572 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3573 actual_value = dct.get(m['key'])
3574 numeric_comparison = None
3575 if isinstance(actual_value, compat_numeric_types):
3576 # If the original field is a string and the matching comparison value is
3577 # a number we should respect the origin of the original field
3578 # and process comparison value as a string (see
3579 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3580 try:
3581 numeric_comparison = int(comparison_value)
3582 except ValueError:
3583 numeric_comparison = parse_filesize(comparison_value)
3584 if numeric_comparison is None:
3585 numeric_comparison = parse_filesize(f'{comparison_value}B')
3586 if numeric_comparison is None:
3587 numeric_comparison = parse_duration(comparison_value)
3588 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3589 raise ValueError('Operator %s only supports string values!' % m['op'])
3590 if actual_value is None:
3591 return is_incomplete(m['key']) or m['none_inclusive']
3592 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3593
3594 UNARY_OPERATORS = {
3595 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3596 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3597 }
3598 operator_rex = re.compile(r'''(?x)\s*
3599 (?P<op>%s)\s*(?P<key>[a-z_]+)
3600 \s*$
3601 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3602 m = operator_rex.search(filter_part)
3603 if m:
3604 op = UNARY_OPERATORS[m.group('op')]
3605 actual_value = dct.get(m.group('key'))
3606 if is_incomplete(m.group('key')) and actual_value is None:
3607 return True
3608 return op(actual_value)
3609
3610 raise ValueError('Invalid filter part %r' % filter_part)
3611
3612
3613 def match_str(filter_str, dct, incomplete=False):
3614 """ Filter a dictionary with a simple string syntax.
3615 @returns Whether the filter passes
3616 @param incomplete Set of keys that are expected to be missing from dct.
3617 Can be True/False to indicate all/none of the keys may be missing.
3618 All conditions on incomplete keys pass if the key is missing
3619 """
3620 return all(
3621 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3622 for filter_part in re.split(r'(?<!\\)&', filter_str))
3623
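# A quick illustration of the filter syntax handled above (all values made up):
#   match_str('duration > 60 & uploader', {'duration': 90, 'uploader': 'x'})    # -> True
#   match_str('!is_live & duration <= 60', {'is_live': False, 'duration': 90})  # -> False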
3624
3625 def match_filter_func(filters):
3626 if not filters:
3627 return None
3628 filters = variadic(filters)
3629
3630 def _match_func(info_dict, *args, **kwargs):
3631 if any(match_str(f, info_dict, *args, **kwargs) for f in filters):
3632 return None
3633 else:
3634 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3635 filter_str = ') | ('.join(map(str.strip, filters))
3636 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3637 return _match_func
3638
3639
3640 def parse_dfxp_time_expr(time_expr):
3641 if not time_expr:
3642 return
3643
3644 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3645 if mobj:
3646 return float(mobj.group('time_offset'))
3647
3648 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3649 if mobj:
3650 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3651
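# The two accepted formats, for reference (illustrative values):
#   parse_dfxp_time_expr('5.1s')        # -> 5.1
#   parse_dfxp_time_expr('00:01:10.5')  # -> 70.5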
3652
3653 def srt_subtitles_timecode(seconds):
3654 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3655
3656
3657 def ass_subtitles_timecode(seconds):
3658 time = timetuple_from_msec(seconds * 1000)
3659 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3660
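# For example, 3661.5 seconds renders as '01:01:01,500' in SRT but as
# '1:01:01.50' in ASS (comma vs. period, milliseconds vs. centiseconds).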
3661
3662 def dfxp2srt(dfxp_data):
3663 '''
3664 @param dfxp_data A bytes-like object containing DFXP data
3665 @returns A unicode object containing converted SRT data
3666 '''
3667 LEGACY_NAMESPACES = (
3668 (b'http://www.w3.org/ns/ttml', [
3669 b'http://www.w3.org/2004/11/ttaf1',
3670 b'http://www.w3.org/2006/04/ttaf1',
3671 b'http://www.w3.org/2006/10/ttaf1',
3672 ]),
3673 (b'http://www.w3.org/ns/ttml#styling', [
3674 b'http://www.w3.org/ns/ttml#style',
3675 ]),
3676 )
3677
3678 SUPPORTED_STYLING = [
3679 'color',
3680 'fontFamily',
3681 'fontSize',
3682 'fontStyle',
3683 'fontWeight',
3684 'textDecoration'
3685 ]
3686
3687 _x = functools.partial(xpath_with_ns, ns_map={
3688 'xml': 'http://www.w3.org/XML/1998/namespace',
3689 'ttml': 'http://www.w3.org/ns/ttml',
3690 'tts': 'http://www.w3.org/ns/ttml#styling',
3691 })
3692
3693 styles = {}
3694 default_style = {}
3695
3696 class TTMLPElementParser(object):
3697 def __init__(self):
3698 self._out = ''
3699 self._unclosed_elements, self._applied_styles = [], []  # per-instance state; class-level lists would be shared across parses
3700
3701 def start(self, tag, attrib):
3702 if tag in (_x('ttml:br'), 'br'):
3703 self._out += '\n'
3704 else:
3705 unclosed_elements = []
3706 style = {}
3707 element_style_id = attrib.get('style')
3708 if default_style:
3709 style.update(default_style)
3710 if element_style_id:
3711 style.update(styles.get(element_style_id, {}))
3712 for prop in SUPPORTED_STYLING:
3713 prop_val = attrib.get(_x('tts:' + prop))
3714 if prop_val:
3715 style[prop] = prop_val
3716 if style:
3717 font = ''
3718 for k, v in sorted(style.items()):
3719 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3720 continue
3721 if k == 'color':
3722 font += ' color="%s"' % v
3723 elif k == 'fontSize':
3724 font += ' size="%s"' % v
3725 elif k == 'fontFamily':
3726 font += ' face="%s"' % v
3727 elif k == 'fontWeight' and v == 'bold':
3728 self._out += '<b>'
3729 unclosed_elements.append('b')
3730 elif k == 'fontStyle' and v == 'italic':
3731 self._out += '<i>'
3732 unclosed_elements.append('i')
3733 elif k == 'textDecoration' and v == 'underline':
3734 self._out += '<u>'
3735 unclosed_elements.append('u')
3736 if font:
3737 self._out += '<font' + font + '>'
3738 unclosed_elements.append('font')
3739 applied_style = {}
3740 if self._applied_styles:
3741 applied_style.update(self._applied_styles[-1])
3742 applied_style.update(style)
3743 self._applied_styles.append(applied_style)
3744 self._unclosed_elements.append(unclosed_elements)
3745
3746 def end(self, tag):
3747 if tag not in (_x('ttml:br'), 'br'):
3748 unclosed_elements = self._unclosed_elements.pop()
3749 for element in reversed(unclosed_elements):
3750 self._out += '</%s>' % element
3751 if unclosed_elements and self._applied_styles:
3752 self._applied_styles.pop()
3753
3754 def data(self, data):
3755 self._out += data
3756
3757 def close(self):
3758 return self._out.strip()
3759
3760 def parse_node(node):
3761 target = TTMLPElementParser()
3762 parser = xml.etree.ElementTree.XMLParser(target=target)
3763 parser.feed(xml.etree.ElementTree.tostring(node))
3764 return parser.close()
3765
3766 for k, v in LEGACY_NAMESPACES:
3767 for ns in v:
3768 dfxp_data = dfxp_data.replace(ns, k)
3769
3770 dfxp = compat_etree_fromstring(dfxp_data)
3771 out = []
3772 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3773
3774 if not paras:
3775 raise ValueError('Invalid dfxp/TTML subtitle')
3776
3777 repeat = False
3778 while True:
3779 for style in dfxp.findall(_x('.//ttml:style')):
3780 style_id = style.get('id') or style.get(_x('xml:id'))
3781 if not style_id:
3782 continue
3783 parent_style_id = style.get('style')
3784 if parent_style_id:
3785 if parent_style_id not in styles:
3786 repeat = True
3787 continue
3788 styles[style_id] = styles[parent_style_id].copy()
3789 for prop in SUPPORTED_STYLING:
3790 prop_val = style.get(_x('tts:' + prop))
3791 if prop_val:
3792 styles.setdefault(style_id, {})[prop] = prop_val
3793 if repeat:
3794 repeat = False
3795 else:
3796 break
3797
3798 for p in ('body', 'div'):
3799 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3800 if ele is None:
3801 continue
3802 style = styles.get(ele.get('style'))
3803 if not style:
3804 continue
3805 default_style.update(style)
3806
3807 for para, index in zip(paras, itertools.count(1)):
3808 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3809 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3810 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3811 if begin_time is None:
3812 continue
3813 if not end_time:
3814 if not dur:
3815 continue
3816 end_time = begin_time + dur
3817 out.append('%d\n%s --> %s\n%s\n\n' % (
3818 index,
3819 srt_subtitles_timecode(begin_time),
3820 srt_subtitles_timecode(end_time),
3821 parse_node(para)))
3822
3823 return ''.join(out)
3824
3825
3826 def cli_option(params, command_option, param):
3827 param = params.get(param)
3828 if param is not None:  # even falsy values (e.g. 0) must be converted to str
3829 param = compat_str(param)
3830 return [command_option, param] if param is not None else []
3831
3832
3833 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3834 param = params.get(param)
3835 if param is None:
3836 return []
3837 assert isinstance(param, bool)
3838 if separator:
3839 return [command_option + separator + (true_value if param else false_value)]
3840 return [command_option, true_value if param else false_value]
3841
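# Illustrative calls, with params={'nocheckcertificate': True}:
#   cli_bool_option(params, '--no-check-certificate', 'nocheckcertificate')
#       -> ['--no-check-certificate', 'true']
#   cli_bool_option(params, '--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
#       -> ['--check-certificate=false']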
3842
3843 def cli_valueless_option(params, command_option, param, expected_value=True):
3844 param = params.get(param)
3845 return [command_option] if param == expected_value else []
3846
3847
3848 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3849 if isinstance(argdict, (list, tuple)): # for backward compatibility
3850 if use_compat:
3851 return argdict
3852 else:
3853 argdict = None
3854 if argdict is None:
3855 return default
3856 assert isinstance(argdict, dict)
3857
3858 assert isinstance(keys, (list, tuple))
3859 for key_list in keys:
3860 arg_list = list(filter(
3861 lambda x: x is not None,
3862 [argdict.get(key.lower()) for key in variadic(key_list)]))
3863 if arg_list:
3864 return [arg for args in arg_list for arg in args]
3865 return default
3866
3867
3868 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3869 main_key, exe = main_key.lower(), exe.lower()
3870 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3871 keys = [f'{root_key}{k}' for k in (keys or [''])]
3872 if root_key in keys:
3873 if main_key != exe:
3874 keys.append((main_key, exe))
3875 keys.append('default')
3876 else:
3877 use_compat = False
3878 return cli_configuration_args(argdict, keys, default, use_compat)
3879
3880
3881 class ISO639Utils(object):
3882 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3883 _lang_map = {
3884 'aa': 'aar',
3885 'ab': 'abk',
3886 'ae': 'ave',
3887 'af': 'afr',
3888 'ak': 'aka',
3889 'am': 'amh',
3890 'an': 'arg',
3891 'ar': 'ara',
3892 'as': 'asm',
3893 'av': 'ava',
3894 'ay': 'aym',
3895 'az': 'aze',
3896 'ba': 'bak',
3897 'be': 'bel',
3898 'bg': 'bul',
3899 'bh': 'bih',
3900 'bi': 'bis',
3901 'bm': 'bam',
3902 'bn': 'ben',
3903 'bo': 'bod',
3904 'br': 'bre',
3905 'bs': 'bos',
3906 'ca': 'cat',
3907 'ce': 'che',
3908 'ch': 'cha',
3909 'co': 'cos',
3910 'cr': 'cre',
3911 'cs': 'ces',
3912 'cu': 'chu',
3913 'cv': 'chv',
3914 'cy': 'cym',
3915 'da': 'dan',
3916 'de': 'deu',
3917 'dv': 'div',
3918 'dz': 'dzo',
3919 'ee': 'ewe',
3920 'el': 'ell',
3921 'en': 'eng',
3922 'eo': 'epo',
3923 'es': 'spa',
3924 'et': 'est',
3925 'eu': 'eus',
3926 'fa': 'fas',
3927 'ff': 'ful',
3928 'fi': 'fin',
3929 'fj': 'fij',
3930 'fo': 'fao',
3931 'fr': 'fra',
3932 'fy': 'fry',
3933 'ga': 'gle',
3934 'gd': 'gla',
3935 'gl': 'glg',
3936 'gn': 'grn',
3937 'gu': 'guj',
3938 'gv': 'glv',
3939 'ha': 'hau',
3940 'he': 'heb',
3941 'iw': 'heb', # Replaced by he in 1989 revision
3942 'hi': 'hin',
3943 'ho': 'hmo',
3944 'hr': 'hrv',
3945 'ht': 'hat',
3946 'hu': 'hun',
3947 'hy': 'hye',
3948 'hz': 'her',
3949 'ia': 'ina',
3950 'id': 'ind',
3951 'in': 'ind', # Replaced by id in 1989 revision
3952 'ie': 'ile',
3953 'ig': 'ibo',
3954 'ii': 'iii',
3955 'ik': 'ipk',
3956 'io': 'ido',
3957 'is': 'isl',
3958 'it': 'ita',
3959 'iu': 'iku',
3960 'ja': 'jpn',
3961 'jv': 'jav',
3962 'ka': 'kat',
3963 'kg': 'kon',
3964 'ki': 'kik',
3965 'kj': 'kua',
3966 'kk': 'kaz',
3967 'kl': 'kal',
3968 'km': 'khm',
3969 'kn': 'kan',
3970 'ko': 'kor',
3971 'kr': 'kau',
3972 'ks': 'kas',
3973 'ku': 'kur',
3974 'kv': 'kom',
3975 'kw': 'cor',
3976 'ky': 'kir',
3977 'la': 'lat',
3978 'lb': 'ltz',
3979 'lg': 'lug',
3980 'li': 'lim',
3981 'ln': 'lin',
3982 'lo': 'lao',
3983 'lt': 'lit',
3984 'lu': 'lub',
3985 'lv': 'lav',
3986 'mg': 'mlg',
3987 'mh': 'mah',
3988 'mi': 'mri',
3989 'mk': 'mkd',
3990 'ml': 'mal',
3991 'mn': 'mon',
3992 'mr': 'mar',
3993 'ms': 'msa',
3994 'mt': 'mlt',
3995 'my': 'mya',
3996 'na': 'nau',
3997 'nb': 'nob',
3998 'nd': 'nde',
3999 'ne': 'nep',
4000 'ng': 'ndo',
4001 'nl': 'nld',
4002 'nn': 'nno',
4003 'no': 'nor',
4004 'nr': 'nbl',
4005 'nv': 'nav',
4006 'ny': 'nya',
4007 'oc': 'oci',
4008 'oj': 'oji',
4009 'om': 'orm',
4010 'or': 'ori',
4011 'os': 'oss',
4012 'pa': 'pan',
4013 'pi': 'pli',
4014 'pl': 'pol',
4015 'ps': 'pus',
4016 'pt': 'por',
4017 'qu': 'que',
4018 'rm': 'roh',
4019 'rn': 'run',
4020 'ro': 'ron',
4021 'ru': 'rus',
4022 'rw': 'kin',
4023 'sa': 'san',
4024 'sc': 'srd',
4025 'sd': 'snd',
4026 'se': 'sme',
4027 'sg': 'sag',
4028 'si': 'sin',
4029 'sk': 'slk',
4030 'sl': 'slv',
4031 'sm': 'smo',
4032 'sn': 'sna',
4033 'so': 'som',
4034 'sq': 'sqi',
4035 'sr': 'srp',
4036 'ss': 'ssw',
4037 'st': 'sot',
4038 'su': 'sun',
4039 'sv': 'swe',
4040 'sw': 'swa',
4041 'ta': 'tam',
4042 'te': 'tel',
4043 'tg': 'tgk',
4044 'th': 'tha',
4045 'ti': 'tir',
4046 'tk': 'tuk',
4047 'tl': 'tgl',
4048 'tn': 'tsn',
4049 'to': 'ton',
4050 'tr': 'tur',
4051 'ts': 'tso',
4052 'tt': 'tat',
4053 'tw': 'twi',
4054 'ty': 'tah',
4055 'ug': 'uig',
4056 'uk': 'ukr',
4057 'ur': 'urd',
4058 'uz': 'uzb',
4059 've': 'ven',
4060 'vi': 'vie',
4061 'vo': 'vol',
4062 'wa': 'wln',
4063 'wo': 'wol',
4064 'xh': 'xho',
4065 'yi': 'yid',
4066 'ji': 'yid', # Replaced by yi in 1989 revision
4067 'yo': 'yor',
4068 'za': 'zha',
4069 'zh': 'zho',
4070 'zu': 'zul',
4071 }
4072
4073 @classmethod
4074 def short2long(cls, code):
4075 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4076 return cls._lang_map.get(code[:2])
4077
4078 @classmethod
4079 def long2short(cls, code):
4080 """Convert language code from ISO 639-2/T to ISO 639-1"""
4081 for short_name, long_name in cls._lang_map.items():
4082 if long_name == code:
4083 return short_name
4084
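# e.g. ISO639Utils.short2long('en') == 'eng' and ISO639Utils.long2short('eng') == 'en'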
4085
4086 class ISO3166Utils(object):
4087 # From http://data.okfn.org/data/core/country-list
4088 _country_map = {
4089 'AF': 'Afghanistan',
4090 'AX': 'Åland Islands',
4091 'AL': 'Albania',
4092 'DZ': 'Algeria',
4093 'AS': 'American Samoa',
4094 'AD': 'Andorra',
4095 'AO': 'Angola',
4096 'AI': 'Anguilla',
4097 'AQ': 'Antarctica',
4098 'AG': 'Antigua and Barbuda',
4099 'AR': 'Argentina',
4100 'AM': 'Armenia',
4101 'AW': 'Aruba',
4102 'AU': 'Australia',
4103 'AT': 'Austria',
4104 'AZ': 'Azerbaijan',
4105 'BS': 'Bahamas',
4106 'BH': 'Bahrain',
4107 'BD': 'Bangladesh',
4108 'BB': 'Barbados',
4109 'BY': 'Belarus',
4110 'BE': 'Belgium',
4111 'BZ': 'Belize',
4112 'BJ': 'Benin',
4113 'BM': 'Bermuda',
4114 'BT': 'Bhutan',
4115 'BO': 'Bolivia, Plurinational State of',
4116 'BQ': 'Bonaire, Sint Eustatius and Saba',
4117 'BA': 'Bosnia and Herzegovina',
4118 'BW': 'Botswana',
4119 'BV': 'Bouvet Island',
4120 'BR': 'Brazil',
4121 'IO': 'British Indian Ocean Territory',
4122 'BN': 'Brunei Darussalam',
4123 'BG': 'Bulgaria',
4124 'BF': 'Burkina Faso',
4125 'BI': 'Burundi',
4126 'KH': 'Cambodia',
4127 'CM': 'Cameroon',
4128 'CA': 'Canada',
4129 'CV': 'Cape Verde',
4130 'KY': 'Cayman Islands',
4131 'CF': 'Central African Republic',
4132 'TD': 'Chad',
4133 'CL': 'Chile',
4134 'CN': 'China',
4135 'CX': 'Christmas Island',
4136 'CC': 'Cocos (Keeling) Islands',
4137 'CO': 'Colombia',
4138 'KM': 'Comoros',
4139 'CG': 'Congo',
4140 'CD': 'Congo, the Democratic Republic of the',
4141 'CK': 'Cook Islands',
4142 'CR': 'Costa Rica',
4143 'CI': 'Côte d\'Ivoire',
4144 'HR': 'Croatia',
4145 'CU': 'Cuba',
4146 'CW': 'Curaçao',
4147 'CY': 'Cyprus',
4148 'CZ': 'Czech Republic',
4149 'DK': 'Denmark',
4150 'DJ': 'Djibouti',
4151 'DM': 'Dominica',
4152 'DO': 'Dominican Republic',
4153 'EC': 'Ecuador',
4154 'EG': 'Egypt',
4155 'SV': 'El Salvador',
4156 'GQ': 'Equatorial Guinea',
4157 'ER': 'Eritrea',
4158 'EE': 'Estonia',
4159 'ET': 'Ethiopia',
4160 'FK': 'Falkland Islands (Malvinas)',
4161 'FO': 'Faroe Islands',
4162 'FJ': 'Fiji',
4163 'FI': 'Finland',
4164 'FR': 'France',
4165 'GF': 'French Guiana',
4166 'PF': 'French Polynesia',
4167 'TF': 'French Southern Territories',
4168 'GA': 'Gabon',
4169 'GM': 'Gambia',
4170 'GE': 'Georgia',
4171 'DE': 'Germany',
4172 'GH': 'Ghana',
4173 'GI': 'Gibraltar',
4174 'GR': 'Greece',
4175 'GL': 'Greenland',
4176 'GD': 'Grenada',
4177 'GP': 'Guadeloupe',
4178 'GU': 'Guam',
4179 'GT': 'Guatemala',
4180 'GG': 'Guernsey',
4181 'GN': 'Guinea',
4182 'GW': 'Guinea-Bissau',
4183 'GY': 'Guyana',
4184 'HT': 'Haiti',
4185 'HM': 'Heard Island and McDonald Islands',
4186 'VA': 'Holy See (Vatican City State)',
4187 'HN': 'Honduras',
4188 'HK': 'Hong Kong',
4189 'HU': 'Hungary',
4190 'IS': 'Iceland',
4191 'IN': 'India',
4192 'ID': 'Indonesia',
4193 'IR': 'Iran, Islamic Republic of',
4194 'IQ': 'Iraq',
4195 'IE': 'Ireland',
4196 'IM': 'Isle of Man',
4197 'IL': 'Israel',
4198 'IT': 'Italy',
4199 'JM': 'Jamaica',
4200 'JP': 'Japan',
4201 'JE': 'Jersey',
4202 'JO': 'Jordan',
4203 'KZ': 'Kazakhstan',
4204 'KE': 'Kenya',
4205 'KI': 'Kiribati',
4206 'KP': 'Korea, Democratic People\'s Republic of',
4207 'KR': 'Korea, Republic of',
4208 'KW': 'Kuwait',
4209 'KG': 'Kyrgyzstan',
4210 'LA': 'Lao People\'s Democratic Republic',
4211 'LV': 'Latvia',
4212 'LB': 'Lebanon',
4213 'LS': 'Lesotho',
4214 'LR': 'Liberia',
4215 'LY': 'Libya',
4216 'LI': 'Liechtenstein',
4217 'LT': 'Lithuania',
4218 'LU': 'Luxembourg',
4219 'MO': 'Macao',
4220 'MK': 'Macedonia, the Former Yugoslav Republic of',
4221 'MG': 'Madagascar',
4222 'MW': 'Malawi',
4223 'MY': 'Malaysia',
4224 'MV': 'Maldives',
4225 'ML': 'Mali',
4226 'MT': 'Malta',
4227 'MH': 'Marshall Islands',
4228 'MQ': 'Martinique',
4229 'MR': 'Mauritania',
4230 'MU': 'Mauritius',
4231 'YT': 'Mayotte',
4232 'MX': 'Mexico',
4233 'FM': 'Micronesia, Federated States of',
4234 'MD': 'Moldova, Republic of',
4235 'MC': 'Monaco',
4236 'MN': 'Mongolia',
4237 'ME': 'Montenegro',
4238 'MS': 'Montserrat',
4239 'MA': 'Morocco',
4240 'MZ': 'Mozambique',
4241 'MM': 'Myanmar',
4242 'NA': 'Namibia',
4243 'NR': 'Nauru',
4244 'NP': 'Nepal',
4245 'NL': 'Netherlands',
4246 'NC': 'New Caledonia',
4247 'NZ': 'New Zealand',
4248 'NI': 'Nicaragua',
4249 'NE': 'Niger',
4250 'NG': 'Nigeria',
4251 'NU': 'Niue',
4252 'NF': 'Norfolk Island',
4253 'MP': 'Northern Mariana Islands',
4254 'NO': 'Norway',
4255 'OM': 'Oman',
4256 'PK': 'Pakistan',
4257 'PW': 'Palau',
4258 'PS': 'Palestine, State of',
4259 'PA': 'Panama',
4260 'PG': 'Papua New Guinea',
4261 'PY': 'Paraguay',
4262 'PE': 'Peru',
4263 'PH': 'Philippines',
4264 'PN': 'Pitcairn',
4265 'PL': 'Poland',
4266 'PT': 'Portugal',
4267 'PR': 'Puerto Rico',
4268 'QA': 'Qatar',
4269 'RE': 'Réunion',
4270 'RO': 'Romania',
4271 'RU': 'Russian Federation',
4272 'RW': 'Rwanda',
4273 'BL': 'Saint Barthélemy',
4274 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4275 'KN': 'Saint Kitts and Nevis',
4276 'LC': 'Saint Lucia',
4277 'MF': 'Saint Martin (French part)',
4278 'PM': 'Saint Pierre and Miquelon',
4279 'VC': 'Saint Vincent and the Grenadines',
4280 'WS': 'Samoa',
4281 'SM': 'San Marino',
4282 'ST': 'Sao Tome and Principe',
4283 'SA': 'Saudi Arabia',
4284 'SN': 'Senegal',
4285 'RS': 'Serbia',
4286 'SC': 'Seychelles',
4287 'SL': 'Sierra Leone',
4288 'SG': 'Singapore',
4289 'SX': 'Sint Maarten (Dutch part)',
4290 'SK': 'Slovakia',
4291 'SI': 'Slovenia',
4292 'SB': 'Solomon Islands',
4293 'SO': 'Somalia',
4294 'ZA': 'South Africa',
4295 'GS': 'South Georgia and the South Sandwich Islands',
4296 'SS': 'South Sudan',
4297 'ES': 'Spain',
4298 'LK': 'Sri Lanka',
4299 'SD': 'Sudan',
4300 'SR': 'Suriname',
4301 'SJ': 'Svalbard and Jan Mayen',
4302 'SZ': 'Swaziland',
4303 'SE': 'Sweden',
4304 'CH': 'Switzerland',
4305 'SY': 'Syrian Arab Republic',
4306 'TW': 'Taiwan, Province of China',
4307 'TJ': 'Tajikistan',
4308 'TZ': 'Tanzania, United Republic of',
4309 'TH': 'Thailand',
4310 'TL': 'Timor-Leste',
4311 'TG': 'Togo',
4312 'TK': 'Tokelau',
4313 'TO': 'Tonga',
4314 'TT': 'Trinidad and Tobago',
4315 'TN': 'Tunisia',
4316 'TR': 'Turkey',
4317 'TM': 'Turkmenistan',
4318 'TC': 'Turks and Caicos Islands',
4319 'TV': 'Tuvalu',
4320 'UG': 'Uganda',
4321 'UA': 'Ukraine',
4322 'AE': 'United Arab Emirates',
4323 'GB': 'United Kingdom',
4324 'US': 'United States',
4325 'UM': 'United States Minor Outlying Islands',
4326 'UY': 'Uruguay',
4327 'UZ': 'Uzbekistan',
4328 'VU': 'Vanuatu',
4329 'VE': 'Venezuela, Bolivarian Republic of',
4330 'VN': 'Viet Nam',
4331 'VG': 'Virgin Islands, British',
4332 'VI': 'Virgin Islands, U.S.',
4333 'WF': 'Wallis and Futuna',
4334 'EH': 'Western Sahara',
4335 'YE': 'Yemen',
4336 'ZM': 'Zambia',
4337 'ZW': 'Zimbabwe',
4338 }
4339
4340 @classmethod
4341 def short2full(cls, code):
4342 """Convert an ISO 3166-2 country code to the corresponding full name"""
4343 return cls._country_map.get(code.upper())
4344
4345
4346 class GeoUtils(object):
4347 # Major IPv4 address blocks per country
4348 _country_ip_map = {
4349 'AD': '46.172.224.0/19',
4350 'AE': '94.200.0.0/13',
4351 'AF': '149.54.0.0/17',
4352 'AG': '209.59.64.0/18',
4353 'AI': '204.14.248.0/21',
4354 'AL': '46.99.0.0/16',
4355 'AM': '46.70.0.0/15',
4356 'AO': '105.168.0.0/13',
4357 'AP': '182.50.184.0/21',
4358 'AQ': '23.154.160.0/24',
4359 'AR': '181.0.0.0/12',
4360 'AS': '202.70.112.0/20',
4361 'AT': '77.116.0.0/14',
4362 'AU': '1.128.0.0/11',
4363 'AW': '181.41.0.0/18',
4364 'AX': '185.217.4.0/22',
4365 'AZ': '5.197.0.0/16',
4366 'BA': '31.176.128.0/17',
4367 'BB': '65.48.128.0/17',
4368 'BD': '114.130.0.0/16',
4369 'BE': '57.0.0.0/8',
4370 'BF': '102.178.0.0/15',
4371 'BG': '95.42.0.0/15',
4372 'BH': '37.131.0.0/17',
4373 'BI': '154.117.192.0/18',
4374 'BJ': '137.255.0.0/16',
4375 'BL': '185.212.72.0/23',
4376 'BM': '196.12.64.0/18',
4377 'BN': '156.31.0.0/16',
4378 'BO': '161.56.0.0/16',
4379 'BQ': '161.0.80.0/20',
4380 'BR': '191.128.0.0/12',
4381 'BS': '24.51.64.0/18',
4382 'BT': '119.2.96.0/19',
4383 'BW': '168.167.0.0/16',
4384 'BY': '178.120.0.0/13',
4385 'BZ': '179.42.192.0/18',
4386 'CA': '99.224.0.0/11',
4387 'CD': '41.243.0.0/16',
4388 'CF': '197.242.176.0/21',
4389 'CG': '160.113.0.0/16',
4390 'CH': '85.0.0.0/13',
4391 'CI': '102.136.0.0/14',
4392 'CK': '202.65.32.0/19',
4393 'CL': '152.172.0.0/14',
4394 'CM': '102.244.0.0/14',
4395 'CN': '36.128.0.0/10',
4396 'CO': '181.240.0.0/12',
4397 'CR': '201.192.0.0/12',
4398 'CU': '152.206.0.0/15',
4399 'CV': '165.90.96.0/19',
4400 'CW': '190.88.128.0/17',
4401 'CY': '31.153.0.0/16',
4402 'CZ': '88.100.0.0/14',
4403 'DE': '53.0.0.0/8',
4404 'DJ': '197.241.0.0/17',
4405 'DK': '87.48.0.0/12',
4406 'DM': '192.243.48.0/20',
4407 'DO': '152.166.0.0/15',
4408 'DZ': '41.96.0.0/12',
4409 'EC': '186.68.0.0/15',
4410 'EE': '90.190.0.0/15',
4411 'EG': '156.160.0.0/11',
4412 'ER': '196.200.96.0/20',
4413 'ES': '88.0.0.0/11',
4414 'ET': '196.188.0.0/14',
4415 'EU': '2.16.0.0/13',
4416 'FI': '91.152.0.0/13',
4417 'FJ': '144.120.0.0/16',
4418 'FK': '80.73.208.0/21',
4419 'FM': '119.252.112.0/20',
4420 'FO': '88.85.32.0/19',
4421 'FR': '90.0.0.0/9',
4422 'GA': '41.158.0.0/15',
4423 'GB': '25.0.0.0/8',
4424 'GD': '74.122.88.0/21',
4425 'GE': '31.146.0.0/16',
4426 'GF': '161.22.64.0/18',
4427 'GG': '62.68.160.0/19',
4428 'GH': '154.160.0.0/12',
4429 'GI': '95.164.0.0/16',
4430 'GL': '88.83.0.0/19',
4431 'GM': '160.182.0.0/15',
4432 'GN': '197.149.192.0/18',
4433 'GP': '104.250.0.0/19',
4434 'GQ': '105.235.224.0/20',
4435 'GR': '94.64.0.0/13',
4436 'GT': '168.234.0.0/16',
4437 'GU': '168.123.0.0/16',
4438 'GW': '197.214.80.0/20',
4439 'GY': '181.41.64.0/18',
4440 'HK': '113.252.0.0/14',
4441 'HN': '181.210.0.0/16',
4442 'HR': '93.136.0.0/13',
4443 'HT': '148.102.128.0/17',
4444 'HU': '84.0.0.0/14',
4445 'ID': '39.192.0.0/10',
4446 'IE': '87.32.0.0/12',
4447 'IL': '79.176.0.0/13',
4448 'IM': '5.62.80.0/20',
4449 'IN': '117.192.0.0/10',
4450 'IO': '203.83.48.0/21',
4451 'IQ': '37.236.0.0/14',
4452 'IR': '2.176.0.0/12',
4453 'IS': '82.221.0.0/16',
4454 'IT': '79.0.0.0/10',
4455 'JE': '87.244.64.0/18',
4456 'JM': '72.27.0.0/17',
4457 'JO': '176.29.0.0/16',
4458 'JP': '133.0.0.0/8',
4459 'KE': '105.48.0.0/12',
4460 'KG': '158.181.128.0/17',
4461 'KH': '36.37.128.0/17',
4462 'KI': '103.25.140.0/22',
4463 'KM': '197.255.224.0/20',
4464 'KN': '198.167.192.0/19',
4465 'KP': '175.45.176.0/22',
4466 'KR': '175.192.0.0/10',
4467 'KW': '37.36.0.0/14',
4468 'KY': '64.96.0.0/15',
4469 'KZ': '2.72.0.0/13',
4470 'LA': '115.84.64.0/18',
4471 'LB': '178.135.0.0/16',
4472 'LC': '24.92.144.0/20',
4473 'LI': '82.117.0.0/19',
4474 'LK': '112.134.0.0/15',
4475 'LR': '102.183.0.0/16',
4476 'LS': '129.232.0.0/17',
4477 'LT': '78.56.0.0/13',
4478 'LU': '188.42.0.0/16',
4479 'LV': '46.109.0.0/16',
4480 'LY': '41.252.0.0/14',
4481 'MA': '105.128.0.0/11',
4482 'MC': '88.209.64.0/18',
4483 'MD': '37.246.0.0/16',
4484 'ME': '178.175.0.0/17',
4485 'MF': '74.112.232.0/21',
4486 'MG': '154.126.0.0/17',
4487 'MH': '117.103.88.0/21',
4488 'MK': '77.28.0.0/15',
4489 'ML': '154.118.128.0/18',
4490 'MM': '37.111.0.0/17',
4491 'MN': '49.0.128.0/17',
4492 'MO': '60.246.0.0/16',
4493 'MP': '202.88.64.0/20',
4494 'MQ': '109.203.224.0/19',
4495 'MR': '41.188.64.0/18',
4496 'MS': '208.90.112.0/22',
4497 'MT': '46.11.0.0/16',
4498 'MU': '105.16.0.0/12',
4499 'MV': '27.114.128.0/18',
4500 'MW': '102.70.0.0/15',
4501 'MX': '187.192.0.0/11',
4502 'MY': '175.136.0.0/13',
4503 'MZ': '197.218.0.0/15',
4504 'NA': '41.182.0.0/16',
4505 'NC': '101.101.0.0/18',
4506 'NE': '197.214.0.0/18',
4507 'NF': '203.17.240.0/22',
4508 'NG': '105.112.0.0/12',
4509 'NI': '186.76.0.0/15',
4510 'NL': '145.96.0.0/11',
4511 'NO': '84.208.0.0/13',
4512 'NP': '36.252.0.0/15',
4513 'NR': '203.98.224.0/19',
4514 'NU': '49.156.48.0/22',
4515 'NZ': '49.224.0.0/14',
4516 'OM': '5.36.0.0/15',
4517 'PA': '186.72.0.0/15',
4518 'PE': '186.160.0.0/14',
4519 'PF': '123.50.64.0/18',
4520 'PG': '124.240.192.0/19',
4521 'PH': '49.144.0.0/13',
4522 'PK': '39.32.0.0/11',
4523 'PL': '83.0.0.0/11',
4524 'PM': '70.36.0.0/20',
4525 'PR': '66.50.0.0/16',
4526 'PS': '188.161.0.0/16',
4527 'PT': '85.240.0.0/13',
4528 'PW': '202.124.224.0/20',
4529 'PY': '181.120.0.0/14',
4530 'QA': '37.210.0.0/15',
4531 'RE': '102.35.0.0/16',
4532 'RO': '79.112.0.0/13',
4533 'RS': '93.86.0.0/15',
4534 'RU': '5.136.0.0/13',
4535 'RW': '41.186.0.0/16',
4536 'SA': '188.48.0.0/13',
4537 'SB': '202.1.160.0/19',
4538 'SC': '154.192.0.0/11',
4539 'SD': '102.120.0.0/13',
4540 'SE': '78.64.0.0/12',
4541 'SG': '8.128.0.0/10',
4542 'SI': '188.196.0.0/14',
4543 'SK': '78.98.0.0/15',
4544 'SL': '102.143.0.0/17',
4545 'SM': '89.186.32.0/19',
4546 'SN': '41.82.0.0/15',
4547 'SO': '154.115.192.0/18',
4548 'SR': '186.179.128.0/17',
4549 'SS': '105.235.208.0/21',
4550 'ST': '197.159.160.0/19',
4551 'SV': '168.243.0.0/16',
4552 'SX': '190.102.0.0/20',
4553 'SY': '5.0.0.0/16',
4554 'SZ': '41.84.224.0/19',
4555 'TC': '65.255.48.0/20',
4556 'TD': '154.68.128.0/19',
4557 'TG': '196.168.0.0/14',
4558 'TH': '171.96.0.0/13',
4559 'TJ': '85.9.128.0/18',
4560 'TK': '27.96.24.0/21',
4561 'TL': '180.189.160.0/20',
4562 'TM': '95.85.96.0/19',
4563 'TN': '197.0.0.0/11',
4564 'TO': '175.176.144.0/21',
4565 'TR': '78.160.0.0/11',
4566 'TT': '186.44.0.0/15',
4567 'TV': '202.2.96.0/19',
4568 'TW': '120.96.0.0/11',
4569 'TZ': '156.156.0.0/14',
4570 'UA': '37.52.0.0/14',
4571 'UG': '102.80.0.0/13',
4572 'US': '6.0.0.0/8',
4573 'UY': '167.56.0.0/13',
4574 'UZ': '84.54.64.0/18',
4575 'VA': '212.77.0.0/19',
4576 'VC': '207.191.240.0/21',
4577 'VE': '186.88.0.0/13',
4578 'VG': '66.81.192.0/20',
4579 'VI': '146.226.0.0/16',
4580 'VN': '14.160.0.0/11',
4581 'VU': '202.80.32.0/20',
4582 'WF': '117.20.32.0/21',
4583 'WS': '202.4.32.0/19',
4584 'YE': '134.35.0.0/16',
4585 'YT': '41.242.116.0/22',
4586 'ZA': '41.0.0.0/11',
4587 'ZM': '102.144.0.0/13',
4588 'ZW': '102.177.192.0/18',
4589 }
4590
4591 @classmethod
4592 def random_ipv4(cls, code_or_block):
4593 if len(code_or_block) == 2:
4594 block = cls._country_ip_map.get(code_or_block.upper())
4595 if not block:
4596 return None
4597 else:
4598 block = code_or_block
4599 addr, preflen = block.split('/')
4600 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4601 addr_max = addr_min | (0xffffffff >> int(preflen))
4602 return compat_str(socket.inet_ntoa(
4603 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4604
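# Accepts either a two-letter country code or an explicit CIDR block, e.g.
# GeoUtils.random_ipv4('US') and GeoUtils.random_ipv4('6.0.0.0/8') both return
# a random address from within 6.0.0.0/8 (a different one on each call).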
4605
4606 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4607 def __init__(self, proxies=None):
4608 # Set default handlers
4609 for type in ('http', 'https'):
4610 setattr(self, '%s_open' % type,
4611 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4612 meth(r, proxy, type))
4613 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4614
4615 def proxy_open(self, req, proxy, type):
4616 req_proxy = req.headers.get('Ytdl-request-proxy')
4617 if req_proxy is not None:
4618 proxy = req_proxy
4619 del req.headers['Ytdl-request-proxy']
4620
4621 if proxy == '__noproxy__':
4622 return None # No Proxy
4623 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4624 req.add_header('Ytdl-socks-proxy', proxy)
4625 # yt-dlp's http/https handlers wrap the socket with socks themselves
4626 return None
4627 return compat_urllib_request.ProxyHandler.proxy_open(
4628 self, req, proxy, type)
4629
4630
4631 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4632 # released into the Public Domain
4633 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4634
4635 def long_to_bytes(n, blocksize=0):
4636 """long_to_bytes(n:long, blocksize:int) : string
4637 Convert a long integer to a byte string.
4638
4639 If optional blocksize is given and greater than zero, pad the front of the
4640 byte string with binary zeros so that the length is a multiple of
4641 blocksize.
4642 """
4643 # after much testing, this algorithm was deemed to be the fastest
4644 s = b''
4645 n = int(n)
4646 while n > 0:
4647 s = compat_struct_pack('>I', n & 0xffffffff) + s
4648 n = n >> 32
4649 # strip off leading zeros
4650 for i in range(len(s)):
4651 if s[i] != b'\000'[0]:
4652 break
4653 else:
4654 # only happens when n == 0
4655 s = b'\000'
4656 i = 0
4657 s = s[i:]
4658 # add back some pad bytes. this could be done more efficiently w.r.t. the
4659 # de-padding being done above, but sigh...
4660 if blocksize > 0 and len(s) % blocksize:
4661 s = (blocksize - len(s) % blocksize) * b'\000' + s
4662 return s
4663
4664
4665 def bytes_to_long(s):
4666 """bytes_to_long(string) : long
4667 Convert a byte string to a long integer.
4668
4669 This is (essentially) the inverse of long_to_bytes().
4670 """
4671 acc = 0
4672 length = len(s)
4673 if length % 4:
4674 extra = (4 - length % 4)
4675 s = b'\000' * extra + s
4676 length = length + extra
4677 for i in range(0, length, 4):
4678 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4679 return acc
4680
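# The two functions are inverses of each other, e.g.:
#   bytes_to_long(b'\x01\x00')  # -> 256
#   long_to_bytes(256)          # -> b'\x01\x00'
#   long_to_bytes(256, 4)       # -> b'\x00\x00\x01\x00' (front-padded to the blocksize)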
4681
4682 def ohdave_rsa_encrypt(data, exponent, modulus):
4683 '''
4684 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4685
4686 Input:
4687 data: data to encrypt, bytes-like object
4688 exponent, modulus: parameter e and N of RSA algorithm, both integer
4689 Output: hex string of encrypted data
4690
4691 Limitation: supports one block encryption only
4692 '''
4693
4694 payload = int(binascii.hexlify(data[::-1]), 16)
4695 encrypted = pow(payload, exponent, modulus)
4696 return '%x' % encrypted
4697
4698
4699 def pkcs1pad(data, length):
4700 """
4701 Padding input data with PKCS#1 scheme
4702
4703 @param {int[]} data input data
4704 @param {int} length target length
4705 @returns {int[]} padded data
4706 """
4707 if len(data) > length - 11:
4708 raise ValueError('Input data too long for PKCS#1 padding')
4709
4710 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 requires non-zero padding bytes
4711 return [0, 2] + pseudo_random + [0] + data
4712
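# The resulting block layout is [0x00, 0x02, <non-zero padding>, 0x00, <data>];
# e.g. pkcs1pad([1, 2, 3], 16) yields 16 ints, 10 of them random padding.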
4713
4714 def encode_base_n(num, n, table=None):
4715 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4716 if not table:
4717 table = FULL_TABLE[:n]
4718
4719 if n > len(table):
4720 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4721
4722 if num == 0:
4723 return table[0]
4724
4725 ret = ''
4726 while num:
4727 ret = table[num % n] + ret
4728 num = num // n
4729 return ret
4730
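# e.g. encode_base_n(255, 16) == 'ff' and encode_base_n(255, 2) == '11111111'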
4731
4732 def decode_packed_codes(code):
4733 mobj = re.search(PACKED_CODES_RE, code)
4734 obfuscated_code, base, count, symbols = mobj.groups()
4735 base = int(base)
4736 count = int(count)
4737 symbols = symbols.split('|')
4738 symbol_table = {}
4739
4740 while count:
4741 count -= 1
4742 base_n_count = encode_base_n(count, base)
4743 symbol_table[base_n_count] = symbols[count] or base_n_count
4744
4745 return re.sub(
4746 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4747 obfuscated_code)
4748
4749
4750 def caesar(s, alphabet, shift):
4751 if shift == 0:
4752 return s
4753 l = len(alphabet)
4754 return ''.join(
4755 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4756 for c in s)
4757
4758
4759 def rot47(s):
4760 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4761
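# e.g. caesar('ab', 'abc', 1) == 'bc'; characters outside the alphabet pass
# through unchanged. rot47 is its own inverse: rot47(rot47(s)) == s.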
4762
4763 def parse_m3u8_attributes(attrib):
4764 info = {}
4765 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4766 if val.startswith('"'):
4767 val = val[1:-1]
4768 info[key] = val
4769 return info
4770
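# e.g. parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2"')
#      == {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2'}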
4771
4772 def urshift(val, n):
4773 return val >> n if val >= 0 else (val + 0x100000000) >> n
4774
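# Logical (unsigned) 32-bit right shift, like JavaScript's `>>>` operator:
# urshift(-1, 1) == 0x7fffffff, whereas -1 >> 1 == -1 in plain Python.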
4775
4776 # Based on png2str() written by @gdkchan and improved by @yokrysty
4777 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4778 def decode_png(png_data):
4779 # Reference: https://www.w3.org/TR/PNG/
4780 header = png_data[8:]
4781
4782 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4783 raise IOError('Not a valid PNG file.')
4784
4785 int_map = {1: '>B', 2: '>H', 4: '>I'}
4786 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4787
4788 chunks = []
4789
4790 while header:
4791 length = unpack_integer(header[:4])
4792 header = header[4:]
4793
4794 chunk_type = header[:4]
4795 header = header[4:]
4796
4797 chunk_data = header[:length]
4798 header = header[length:]
4799
4800 header = header[4:] # Skip CRC
4801
4802 chunks.append({
4803 'type': chunk_type,
4804 'length': length,
4805 'data': chunk_data
4806 })
4807
4808 ihdr = chunks[0]['data']
4809
4810 width = unpack_integer(ihdr[:4])
4811 height = unpack_integer(ihdr[4:8])
4812
4813 idat = b''
4814
4815 for chunk in chunks:
4816 if chunk['type'] == b'IDAT':
4817 idat += chunk['data']
4818
4819 if not idat:
4820 raise IOError('Unable to read PNG data.')
4821
4822 decompressed_data = bytearray(zlib.decompress(idat))
4823
4824 stride = width * 3
4825 pixels = []
4826
4827 def _get_pixel(idx):
4828 x = idx % stride
4829 y = idx // stride
4830 return pixels[y][x]
4831
4832 for y in range(height):
4833 basePos = y * (1 + stride)
4834 filter_type = decompressed_data[basePos]
4835
4836 current_row = []
4837
4838 pixels.append(current_row)
4839
4840 for x in range(stride):
4841 color = decompressed_data[1 + basePos + x]
4842 basex = y * stride + x
4843 left = 0
4844 up = 0
4845
4846 if x > 2:
4847 left = _get_pixel(basex - 3)
4848 if y > 0:
4849 up = _get_pixel(basex - stride)
4850
4851 if filter_type == 1: # Sub
4852 color = (color + left) & 0xff
4853 elif filter_type == 2: # Up
4854 color = (color + up) & 0xff
4855 elif filter_type == 3: # Average
4856 color = (color + ((left + up) >> 1)) & 0xff
4857 elif filter_type == 4: # Paeth
4858 a = left
4859 b = up
4860 c = 0
4861
4862 if x > 2 and y > 0:
4863 c = _get_pixel(basex - stride - 3)
4864
4865 p = a + b - c
4866
4867 pa = abs(p - a)
4868 pb = abs(p - b)
4869 pc = abs(p - c)
4870
4871 if pa <= pb and pa <= pc:
4872 color = (color + a) & 0xff
4873 elif pb <= pc:
4874 color = (color + b) & 0xff
4875 else:
4876 color = (color + c) & 0xff
4877
4878 current_row.append(color)
4879
4880 return width, height, pixels
4881
4882
4883 def write_xattr(path, key, value):
4884 # This mess below finds the best xattr tool for the job
4885 try:
4886 # try the pyxattr module...
4887 import xattr
4888
4889 if hasattr(xattr, 'set'): # pyxattr
4890 # Unicode arguments are not supported in python-pyxattr until
4891 # version 0.5.0
4892 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4893 pyxattr_required_version = '0.5.0'
4894 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4895 # TODO: fallback to CLI tools
4896 raise XAttrUnavailableError(
4897 'python-pyxattr is detected but is too old. '
4898 'yt-dlp requires %s or above while your version is %s. '
4899 'Falling back to other xattr implementations' % (
4900 pyxattr_required_version, xattr.__version__))
4901
4902 setxattr = xattr.set
4903 else: # xattr
4904 setxattr = xattr.setxattr
4905
4906 try:
4907 setxattr(path, key, value)
4908 except EnvironmentError as e:
4909 raise XAttrMetadataError(e.errno, e.strerror)
4910
4911 except ImportError:
4912 if compat_os_name == 'nt':
4913 # Write xattrs to NTFS Alternate Data Streams:
4914 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4915 assert ':' not in key
4916 assert os.path.exists(path)
4917
4918 ads_fn = path + ':' + key
4919 try:
4920 with open(ads_fn, 'wb') as f:
4921 f.write(value)
4922 except EnvironmentError as e:
4923 raise XAttrMetadataError(e.errno, e.strerror)
4924 else:
4925 user_has_setfattr = check_executable('setfattr', ['--version'])
4926 user_has_xattr = check_executable('xattr', ['-h'])
4927
4928 if user_has_setfattr or user_has_xattr:
4929
4930 value = value.decode('utf-8')
4931 if user_has_setfattr:
4932 executable = 'setfattr'
4933 opts = ['-n', key, '-v', value]
4934 elif user_has_xattr:
4935 executable = 'xattr'
4936 opts = ['-w', key, value]
4937
4938 cmd = ([encodeFilename(executable, True)]
4939 + [encodeArgument(o) for o in opts]
4940 + [encodeFilename(path, True)])
4941
4942 try:
4943 p = Popen(
4944 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4945 except EnvironmentError as e:
4946 raise XAttrMetadataError(e.errno, e.strerror)
4947 stdout, stderr = p.communicate_or_kill()
4948 stderr = stderr.decode('utf-8', 'replace')
4949 if p.returncode != 0:
4950 raise XAttrMetadataError(p.returncode, stderr)
4951
4952 else:
4953 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
4954 if sys.platform.startswith('linux'):
4955 raise XAttrUnavailableError(
4956 "Couldn't find a tool to set the xattrs. "
4957 "Install either the python 'pyxattr' or 'xattr' "
4958 "modules, or the GNU 'attr' package "
4959 "(which contains the 'setfattr' tool).")
4960 else:
4961 raise XAttrUnavailableError(
4962 "Couldn't find a tool to set the xattrs. "
4963 "Install either the python 'xattr' module, "
4964 "or the 'xattr' binary.")
4965
4966
4967 def random_birthday(year_field, month_field, day_field):
4968 start_date = datetime.date(1950, 1, 1)
4969 end_date = datetime.date(1995, 12, 31)
4970 offset = random.randint(0, (end_date - start_date).days)
4971 random_date = start_date + datetime.timedelta(offset)
4972 return {
4973 year_field: str(random_date.year),
4974 month_field: str(random_date.month),
4975 day_field: str(random_date.day),
4976 }
4977
4978
4979 # Templates for internet shortcut files, which are plain text files.
4980 DOT_URL_LINK_TEMPLATE = '''
4981 [InternetShortcut]
4982 URL=%(url)s
4983 '''.lstrip()
4984
4985 DOT_WEBLOC_LINK_TEMPLATE = '''
4986 <?xml version="1.0" encoding="UTF-8"?>
4987 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4988 <plist version="1.0">
4989 <dict>
4990 \t<key>URL</key>
4991 \t<string>%(url)s</string>
4992 </dict>
4993 </plist>
4994 '''.lstrip()
4995
4996 DOT_DESKTOP_LINK_TEMPLATE = '''
4997 [Desktop Entry]
4998 Encoding=UTF-8
4999 Name=%(filename)s
5000 Type=Link
5001 URL=%(url)s
5002 Icon=text-html
5003 '''.lstrip()
5004
5005 LINK_TEMPLATES = {
5006 'url': DOT_URL_LINK_TEMPLATE,
5007 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5008 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5009 }
5010
5011
5012 def iri_to_uri(iri):
5013 """
5014 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5015
5016 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes (using the underlying UTF-8 encoding) only those characters that are not already escaped, leaving existing escape sequences intact.
5017 """
5018
5019 iri_parts = compat_urllib_parse_urlparse(iri)
5020
5021 if '[' in iri_parts.netloc:
5022 raise ValueError('IPv6 URIs are not, yet, supported.')
5023 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5024
5025 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5026
5027 net_location = ''
5028 if iri_parts.username:
5029 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
5030 if iri_parts.password is not None:
5031 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
5032 net_location += '@'
5033
5034 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
5035 # The 'idna' encoding produces ASCII text.
5036 if iri_parts.port is not None and iri_parts.port != 80:
5037 net_location += ':' + str(iri_parts.port)
5038
5039 return compat_urllib_parse_urlunparse(
5040 (iri_parts.scheme,
5041 net_location,
5042
5043 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5044
5045 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5046 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5047
5048 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5049 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5050
5051 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5052
5053 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5054
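# e.g. iri_to_uri('http://example.com/ä?q=ü') percent-encodes the UTF-8 bytes of
# the non-ASCII characters, giving 'http://example.com/%C3%A4?q=%C3%BC'.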
5055
5056 def to_high_limit_path(path):
5057 if sys.platform in ['win32', 'cygwin']:
5058 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5059 return r'\\?\ '.rstrip() + os.path.abspath(path)
5060
5061 return path
5062
5063
5064 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
5065 val = traverse_obj(obj, *variadic(field))
5066 if val in ignore:
5067 return default
5068 return template % (func(val) if func else val)
5069
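# e.g. format_field({'width': 1280}, 'width', '%dpx')                     # -> '1280px'
#      format_field({'width': None}, 'width', '%dpx', default='unknown')  # -> 'unknown'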
5070
5071 def clean_podcast_url(url):
5072 return re.sub(r'''(?x)
5073 (?:
5074 (?:
5075 chtbl\.com/track|
5076 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5077 play\.podtrac\.com
5078 )/[^/]+|
5079 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5080 flex\.acast\.com|
5081 pd(?:
5082 cn\.co| # https://podcorn.com/analytics-prefix/
5083 st\.fm # https://podsights.com/docs/
5084 )/e
5085 )/''', '', url)
5086
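# e.g. clean_podcast_url('https://chtbl.com/track/12345/example.com/episode.mp3')
#      == 'https://example.com/episode.mp3' (URL made up for illustration)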
5087
5088 _HEX_TABLE = '0123456789abcdef'
5089
5090
5091 def random_uuidv4():
5092 return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(8, 11) if m.group(0) == 'y' else random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # 'y' carries the RFC 4122 variant bits
5093
5094
5095 def make_dir(path, to_screen=None):
5096 try:
5097 dn = os.path.dirname(path)
5098 if dn and not os.path.exists(dn):
5099 os.makedirs(dn)
5100 return True
5101 except (OSError, IOError) as err:
5102 if callable(to_screen):
5103 to_screen('unable to create directory ' + error_to_compat_str(err))
5104 return False
5105
5106
5107 def get_executable_path():
5108 from zipimport import zipimporter
5109 if hasattr(sys, 'frozen'): # Running from PyInstaller
5110 path = os.path.dirname(sys.executable)
5111 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5112 path = os.path.join(os.path.dirname(__file__), '../..')
5113 else:
5114 path = os.path.join(os.path.dirname(__file__), '..')
5115 return os.path.abspath(path)
5116
5117
5118 def load_plugins(name, suffix, namespace):
5119 classes = {}
5120 try:
5121 plugins_spec = importlib.util.spec_from_file_location(
5122 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5123 plugins = importlib.util.module_from_spec(plugins_spec)
5124 sys.modules[plugins_spec.name] = plugins
5125 plugins_spec.loader.exec_module(plugins)
5126 for name in dir(plugins):
5127 if name in namespace:
5128 continue
5129 if not name.endswith(suffix):
5130 continue
5131 klass = getattr(plugins, name)
5132 classes[name] = namespace[name] = klass
5133 except FileNotFoundError:
5134 pass
5135 return classes
5136
5137
5138 def traverse_obj(
5139 obj, *path_list, default=None, expected_type=None, get_all=True,
5140 casesense=True, is_user_input=False, traverse_string=False):
5141 ''' Traverse nested list/dict/tuple
5142 @param path_list A list of paths which are checked one by one.
5143 Each path is a list of keys where each key is a string,
5144 a function, a tuple of strings/None or "...".
5145 When a function is given, it takes the key as argument and
5146 returns whether the key matches or not. When a tuple is given,
5147 all the keys given in the tuple are traversed, and
5148 "..." traverses all the keys in the object
5149 "None" returns the object without traversal
5150 @param default Default value to return
5151 @param expected_type Only accept final value of this type (Can also be any callable)
5152 @param get_all Return all the values obtained from a path or only the first one
5153 @param casesense Whether to consider dictionary keys as case sensitive
5154 @param is_user_input Whether the keys are generated from user input. If True,
5155 strings are converted to int/slice if necessary
5156 @param traverse_string Whether to traverse inside strings. If True, any
5157 non-compatible object will also be converted into a string
5158 # TODO: Write tests
5159 '''
5160 if not casesense:
5161 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5162 path_list = (map(_lower, variadic(path)) for path in path_list)
5163
5164 def _traverse_obj(obj, path, _current_depth=0):
5165 nonlocal depth
5166 path = tuple(variadic(path))
5167 for i, key in enumerate(path):
5168 if None in (key, obj):
5169 return obj
5170 if isinstance(key, (list, tuple)):
5171 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5172 key = ...
5173 if key is ...:
5174 obj = (obj.values() if isinstance(obj, dict)
5175 else obj if isinstance(obj, (list, tuple, LazyList))
5176 else str(obj) if traverse_string else [])
5177 _current_depth += 1
5178 depth = max(depth, _current_depth)
5179 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5180 elif callable(key):
5181 if isinstance(obj, (list, tuple, LazyList)):
5182 obj = enumerate(obj)
5183 elif isinstance(obj, dict):
5184 obj = obj.items()
5185 else:
5186 if not traverse_string:
5187 return None
5188 obj = str(obj)
5189 _current_depth += 1
5190 depth = max(depth, _current_depth)
5191 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5192 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5193 obj = (obj.get(key) if casesense or (key in obj)
5194 else next((v for k, v in obj.items() if _lower(k) == key), None))
5195 else:
5196 if is_user_input:
5197 key = (int_or_none(key) if ':' not in key
5198 else slice(*map(int_or_none, key.split(':'))))
5199 if key == slice(None):
5200 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5201 if not isinstance(key, (int, slice)):
5202 return None
5203 if not isinstance(obj, (list, tuple, LazyList)):
5204 if not traverse_string:
5205 return None
5206 obj = str(obj)
5207 try:
5208 obj = obj[key]
5209 except IndexError:
5210 return None
5211 return obj
5212
5213 if isinstance(expected_type, type):
5214 type_test = lambda val: val if isinstance(val, expected_type) else None
5215 elif expected_type is not None:
5216 type_test = expected_type
5217 else:
5218 type_test = lambda val: val
5219
5220 for path in path_list:
5221 depth = 0
5222 val = _traverse_obj(obj, path)
5223 if val is not None:
5224 if depth:
5225 for _ in range(depth - 1):
5226 val = itertools.chain.from_iterable(v for v in val if v is not None)
5227 val = [v for v in map(type_test, val) if v is not None]
5228 if val:
5229 return val if get_all else val[0]
5230 else:
5231 val = type_test(val)
5232 if val is not None:
5233 return val
5234 return default
5235
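# Illustrative traversals:
#   traverse_obj({'a': {'b': 3}}, ('a', 'b'))                   # -> 3
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))  # -> [1, 2]
#   traverse_obj({'a': 'x'}, ('a', 'b'), ('a',))                # -> 'x' (first path that yields a value)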
5236
5237 def traverse_dict(dictn, keys, casesense=True):
5238 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5239 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5240 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5241
5242
5243 def get_first(obj, keys, **kwargs):
5244 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5245
5246
5247 def variadic(x, allowed_types=(str, bytes, dict)):
5248 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5249
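# e.g. variadic('abc') == ('abc',) while variadic(['a', 'b']) == ['a', 'b'];
# strings, bytes and dicts count as single values, not as iterables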
5250
5251 def decode_base(value, digits):
5252 # This will convert given base-x string to scalar (long or int)
5253 table = {char: index for index, char in enumerate(digits)}
5254 result = 0
5255 base = len(digits)
5256 for char in value:
5257 result *= base
5258 result += table[char]
5259 return result
5260
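# e.g. decode_base('ff', '0123456789abcdef') == 255; this is the inverse of
# encode_base_n when given the same digit table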
5261
5262 def time_seconds(**kwargs):
5263 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5264 return t.timestamp()
5265
5266
5267 # create a JSON Web Signature (jws) with HS256 algorithm
5268 # the resulting format is in JWS Compact Serialization
5269 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5270 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5271 def jwt_encode_hs256(payload_data, key, headers={}):
5272 header_data = {
5273 'alg': 'HS256',
5274 'typ': 'JWT',
5275 }
5276 if headers:
5277 header_data.update(headers)
5278 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5279 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5280 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5281 signature_b64 = base64.b64encode(h.digest())
5282 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5283 return token
5284
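# e.g. jwt_encode_hs256({'uid': 1}, 'secret') -> b'<header>.<payload>.<signature>'
# Note: this uses standard base64 with '=' padding, while RFC 7515 specifies
# unpadded base64url, so strict validators may reject the resulting token.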
5285
5286 # can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
5287 def jwt_decode_hs256(jwt):
5288 header_b64, payload_b64, signature_b64 = jwt.split('.')
5289 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5290 return payload_data
5291
5292
5293 def supports_terminal_sequences(stream):
5294 if compat_os_name == 'nt':
5295 from .compat import WINDOWS_VT_MODE # Must be imported locally
5296 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5297 return False
5298 elif not os.getenv('TERM'):
5299 return False
5300 try:
5301 return stream.isatty()
5302 except BaseException:
5303 return False
5304
5305
5306 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5307
5308
5309 def remove_terminal_sequences(string):
5310 return _terminal_sequences_re.sub('', string)
5311
5312
5313 def number_of_digits(number):
5314 return len('%d' % number)
5315
5316
5317 def join_nonempty(*values, delim='-', from_dict=None):
5318 if from_dict is not None:
5319 values = map(from_dict.get, values)
5320 return delim.join(map(str, filter(None, values)))
5321
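# e.g. join_nonempty('a', None, '', 'b') == 'a-b'
#      join_nonempty('title', 'id', from_dict={'id': '123'}) == '123'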
5322
5323 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5324 """
5325 Find the largest format dimensions in terms of video width and, for each thumbnail:
5326 * Modify the URL: Match the width with the provided regex and replace with the former width
5327 * Update dimensions
5328
5329 This function is useful with video services that scale the provided thumbnails on demand
5330 """
5331 _keys = ('width', 'height')
5332 max_dimensions = max(
5333 [tuple(format.get(k) or 0 for k in _keys) for format in formats],
5334 default=(0, 0))
5335 if not max_dimensions[0]:
5336 return thumbnails
5337 return [
5338 merge_dicts(
5339 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5340 dict(zip(_keys, max_dimensions)), thumbnail)
5341 for thumbnail in thumbnails
5342 ]
5343
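# Illustrative (names made up): with a widest format of 1920px and
# url_width_re=r'(?<=/)\d+(?=/)', a thumbnail URL 'https://example.com/320/t.jpg'
# becomes 'https://example.com/1920/t.jpg' and its width/height are updated to match.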
5344
5345 def parse_http_range(range):
5346 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5347 if not range:
5348 return None, None, None
5349 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5350 if not crg:
5351 return None, None, None
5352 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5353
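# e.g. parse_http_range('bytes=0-499')       # -> (0, 499, None)
#      parse_http_range('bytes 0-499/1234')  # -> (0, 499, 1234)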
5354
5355 class Config:
5356 own_args = None
5357 filename = None
5358 __initialized = False
5359
5360 def __init__(self, parser, label=None):
5361 self._parser, self.label = parser, label
5362 self._loaded_paths, self.configs = set(), []
5363
5364 def init(self, args=None, filename=None):
5365 assert not self.__initialized
5366 directory = ''
5367 if filename:
5368 location = os.path.realpath(filename)
5369 directory = os.path.dirname(location)
5370 if location in self._loaded_paths:
5371 return False
5372 self._loaded_paths.add(location)
5373
5374 self.__initialized = True
5375 self.own_args, self.filename = args, filename
5376 for location in self._parser.parse_args(args)[0].config_locations or []:
5377 location = os.path.join(directory, expand_path(location))
5378 if os.path.isdir(location):
5379 location = os.path.join(location, 'yt-dlp.conf')
5380 if not os.path.exists(location):
5381 self._parser.error(f'config location {location} does not exist')
5382 self.append_config(self.read_file(location), location)
5383 return True
5384
5385 def __str__(self):
5386 label = join_nonempty(
5387 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5388 delim=' ')
5389 return join_nonempty(
5390 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5391 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5392 delim='\n')
5393
5394 @staticmethod
5395 def read_file(filename, default=[]):
5396 try:
5397 optionf = open(filename)
5398 except IOError:
5399 return default # silently skip if file is not present
5400 try:
5401 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5402 contents = optionf.read()
5403 if sys.version_info < (3,):
5404 contents = contents.decode(preferredencoding())
5405 res = compat_shlex_split(contents, comments=True)
5406 finally:
5407 optionf.close()
5408 return res
5409
5410 @staticmethod
5411 def hide_login_info(opts):
5412 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5413 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5414
5415 def _scrub_eq(o):
5416 m = eqre.match(o)
5417 if m:
5418 return m.group('key') + '=PRIVATE'
5419 else:
5420 return o
5421
5422 opts = list(map(_scrub_eq, opts))
5423 for idx, opt in enumerate(opts):
5424 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5425 opts[idx + 1] = 'PRIVATE'
5426 return opts
5427
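# e.g. Config.hide_login_info(['-u', 'me', '--password=123'])
#      == ['-u', 'PRIVATE', '--password=PRIVATE']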
5428 def append_config(self, *args, label=None):
5429 config = type(self)(self._parser, label)
5430 config._loaded_paths = self._loaded_paths
5431 if config.init(*args):
5432 self.configs.append(config)
5433
5434 @property
5435 def all_args(self):
5436 for config in reversed(self.configs):
5437 yield from config.all_args
5438 yield from self.own_args or []
5439
5440 def parse_args(self):
5441 return self._parser.parse_args(list(self.all_args))
5442
5443
5444 class WebSocketsWrapper():
5445 """Wraps websockets module to use in non-async scopes"""
5446 pool = None  # set on first __enter__; must exist before __enter__ tests it
5447 def __init__(self, url, headers=None, connect=True):
5448 self.loop = asyncio.events.new_event_loop()
5449 self.conn = compat_websockets.connect(
5450 url, extra_headers=headers, ping_interval=None,
5451 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5452 if connect:
5453 self.__enter__()
5454 atexit.register(self.__exit__, None, None, None)
5455
5456 def __enter__(self):
5457 if not self.pool:
5458 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5459 return self
5460
5461 def send(self, *args):
5462 self.run_with_loop(self.pool.send(*args), self.loop)
5463
5464 def recv(self, *args):
5465 return self.run_with_loop(self.pool.recv(*args), self.loop)
5466
5467 def __exit__(self, type, value, traceback):
5468 try:
5469 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5470 finally:
5471 self._cancel_all_tasks(self.loop)  # cancel pending tasks first; this needs the loop to still be open
5472 self.loop.close()
5473
5474 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5475 # for contributors: If any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
5476 @staticmethod
5477 def run_with_loop(main, loop):
5478 if not asyncio.coroutines.iscoroutine(main):
5479 raise ValueError(f'a coroutine was expected, got {main!r}')
5480
5481 try:
5482 return loop.run_until_complete(main)
5483 finally:
5484 loop.run_until_complete(loop.shutdown_asyncgens())
5485 if hasattr(loop, 'shutdown_default_executor'):
5486 loop.run_until_complete(loop.shutdown_default_executor())
5487
5488 @staticmethod
5489 def _cancel_all_tasks(loop):
5490 to_cancel = asyncio.tasks.all_tasks(loop)
5491
5492 if not to_cancel:
5493 return
5494
5495 for task in to_cancel:
5496 task.cancel()
5497
5498 loop.run_until_complete(
5499 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5500
5501 for task in to_cancel:
5502 if task.cancelled():
5503 continue
5504 if task.exception() is not None:
5505 loop.call_exception_handler({
5506 'message': 'unhandled exception during asyncio.run() shutdown',
5507 'exception': task.exception(),
5508 'task': task,
5509 })
5510
5511
5512 has_websockets = bool(compat_websockets)
5513
5514
5515 def merge_headers(*dicts):
5516 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5517 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5518
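# e.g. merge_headers({'user-agent': 'UA1'}, {'User-Agent': 'UA2'}) == {'User-Agent': 'UA2'}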
5519
5520 class classproperty:
5521 def __init__(self, f):
5522 self.f = f
5523
5524 def __get__(self, _, cls):
5525 return self.f(cls)