yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import asyncio
   7 import atexit
   8 import base64
   9 import binascii
  10 import calendar
  11 import codecs
  12 import collections
  13 import contextlib
  14 import ctypes
  15 import datetime
  16 import email.utils
  17 import email.header
  18 import errno
  19 import functools
  20 import gzip
  21 import hashlib
  22 import hmac
  23 import importlib.util
  24 import io
  25 import itertools
  26 import json
  27 import locale
  28 import math
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import socket
  35 import ssl
  36 import subprocess
  37 import sys
  38 import tempfile
  39 import time
  40 import traceback
  41 import xml.etree.ElementTree
  42 import zlib
  43 import mimetypes
  44
  45 from .compat import (
  46     compat_HTMLParseError,
  47     compat_HTMLParser,
  48     compat_HTTPError,
  49     compat_basestring,
  50     compat_brotli,
  51     compat_chr,
  52     compat_cookiejar,
  53     compat_ctypes_WINFUNCTYPE,
  54     compat_etree_fromstring,
  55     compat_expanduser,
  56     compat_html_entities,
  57     compat_html_entities_html5,
  58     compat_http_client,
  59     compat_integer_types,
  60     compat_numeric_types,
  61     compat_kwargs,
  62     compat_os_name,
  63     compat_parse_qs,
  64     compat_shlex_split,
  65     compat_shlex_quote,
  66     compat_str,
  67     compat_struct_pack,
  68     compat_struct_unpack,
  69     compat_urllib_error,
  70     compat_urllib_parse,
  71     compat_urllib_parse_urlencode,
  72     compat_urllib_parse_urlparse,
  73     compat_urllib_parse_urlunparse,
  74     compat_urllib_parse_quote,
  75     compat_urllib_parse_quote_plus,
  76     compat_urllib_parse_unquote_plus,
  77     compat_urllib_request,
  78     compat_urlparse,
  79     compat_websockets,
  80     compat_xpath,
  81 )
  82
  83 from .socks import (
  84     ProxyType,
  85     sockssocket,
  86 )
  87
  88
  89 def register_socks_protocols():
  90     # "Register" SOCKS protocols
  91     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  92     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  93     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  94         if scheme not in compat_urlparse.uses_netloc:
  95             compat_urlparse.uses_netloc.append(scheme)
  96
  97
  98 # This is not clearly defined otherwise
  99 compiled_regex_type = type(re.compile(''))
 100
 101
 102 def random_user_agent():
 103     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 104     _CHROME_VERSIONS = (
 105         '90.0.4430.212',
 106         '90.0.4430.24',
 107         '90.0.4430.70',
 108         '90.0.4430.72',
 109         '90.0.4430.85',
 110         '90.0.4430.93',
 111         '91.0.4472.101',
 112         '91.0.4472.106',
 113         '91.0.4472.114',
 114         '91.0.4472.124',
 115         '91.0.4472.164',
 116         '91.0.4472.19',
 117         '91.0.4472.77',
 118         '92.0.4515.107',
 119         '92.0.4515.115',
 120         '92.0.4515.131',
 121         '92.0.4515.159',
 122         '92.0.4515.43',
 123         '93.0.4556.0',
 124         '93.0.4577.15',
 125         '93.0.4577.63',
 126         '93.0.4577.82',
 127         '94.0.4606.41',
 128         '94.0.4606.54',
 129         '94.0.4606.61',
 130         '94.0.4606.71',
 131         '94.0.4606.81',
 132         '94.0.4606.85',
 133         '95.0.4638.17',
 134         '95.0.4638.50',
 135         '95.0.4638.54',
 136         '95.0.4638.69',
 137         '95.0.4638.74',
 138         '96.0.4664.18',
 139         '96.0.4664.45',
 140         '96.0.4664.55',
 141         '96.0.4664.93',
 142         '97.0.4692.20',
 143     )
 144     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 145
 146
 147 SUPPORTED_ENCODINGS = [
 148     'gzip', 'deflate'
 149 ]
 150 if compat_brotli:
 151     SUPPORTED_ENCODINGS.append('br')
 152
 153 std_headers = {
 154     'User-Agent': random_user_agent(),
 155     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 156     'Accept-Encoding': ', '.join(SUPPORTED_ENCODINGS),
 157     'Accept-Language': 'en-us,en;q=0.5',
 158     'Sec-Fetch-Mode': 'navigate',
 159 }
 160
 161
 162 USER_AGENTS = {
 163     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 164 }
 165
 166
 167 NO_DEFAULT = object()
 168
 169 ENGLISH_MONTH_NAMES = [
 170     'January', 'February', 'March', 'April', 'May', 'June',
 171     'July', 'August', 'September', 'October', 'November', 'December']
 172
 173 MONTH_NAMES = {
 174     'en': ENGLISH_MONTH_NAMES,
 175     'fr': [
 176         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 177         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 178 }
 179
 180 KNOWN_EXTENSIONS = (
 181     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 182     'flv', 'f4v', 'f4a', 'f4b',
 183     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 184     'mkv', 'mka', 'mk3d',
 185     'avi', 'divx',
 186     'mov',
 187     'asf', 'wmv', 'wma',
 188     '3gp', '3g2',
 189     'mp3',
 190     'flac',
 191     'ape',
 192     'wav',
 193     'f4f', 'f4m', 'm3u8', 'smil')
 194
 195 # needed for sanitizing filenames in restricted mode
 196 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 197                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 198                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 199
 200 DATE_FORMATS = (
 201     '%d %B %Y',
 202     '%d %b %Y',
 203     '%B %d %Y',
 204     '%B %dst %Y',
 205     '%B %dnd %Y',
 206     '%B %drd %Y',
 207     '%B %dth %Y',
 208     '%b %d %Y',
 209     '%b %dst %Y',
 210     '%b %dnd %Y',
 211     '%b %drd %Y',
 212     '%b %dth %Y',
 213     '%b %dst %Y %I:%M',
 214     '%b %dnd %Y %I:%M',
 215     '%b %drd %Y %I:%M',
 216     '%b %dth %Y %I:%M',
 217     '%Y %m %d',
 218     '%Y-%m-%d',
 219     '%Y.%m.%d.',
 220     '%Y/%m/%d',
 221     '%Y/%m/%d %H:%M',
 222     '%Y/%m/%d %H:%M:%S',
 223     '%Y%m%d%H%M',
 224     '%Y%m%d%H%M%S',
 225     '%Y%m%d',
 226     '%Y-%m-%d %H:%M',
 227     '%Y-%m-%d %H:%M:%S',
 228     '%Y-%m-%d %H:%M:%S.%f',
 229     '%Y-%m-%d %H:%M:%S:%f',
 230     '%d.%m.%Y %H:%M',
 231     '%d.%m.%Y %H.%M',
 232     '%Y-%m-%dT%H:%M:%SZ',
 233     '%Y-%m-%dT%H:%M:%S.%fZ',
 234     '%Y-%m-%dT%H:%M:%S.%f0Z',
 235     '%Y-%m-%dT%H:%M:%S',
 236     '%Y-%m-%dT%H:%M:%S.%f',
 237     '%Y-%m-%dT%H:%M',
 238     '%b %d %Y at %H:%M',
 239     '%b %d %Y at %H:%M:%S',
 240     '%B %d %Y at %H:%M',
 241     '%B %d %Y at %H:%M:%S',
 242     '%H:%M %d-%b-%Y',
 243 )
 244
 245 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 246 DATE_FORMATS_DAY_FIRST.extend([
 247     '%d-%m-%Y',
 248     '%d.%m.%Y',
 249     '%d.%m.%y',
 250     '%d/%m/%Y',
 251     '%d/%m/%y',
 252     '%d/%m/%Y %H:%M:%S',
 253 ])
 254
 255 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 256 DATE_FORMATS_MONTH_FIRST.extend([
 257     '%m-%d-%Y',
 258     '%m.%d.%Y',
 259     '%m/%d/%Y',
 260     '%m/%d/%y',
 261     '%m/%d/%Y %H:%M:%S',
 262 ])
 263
 264 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 265 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 266
 267
 268 def preferredencoding():
 269     """Get preferred encoding.
 270
 271     Returns the best encoding scheme for the system, based on
 272     locale.getpreferredencoding() and some further tweaks.
 273     """
 274     try:
 275         pref = locale.getpreferredencoding()
 276         'TEST'.encode(pref)
 277     except Exception:
 278         pref = 'UTF-8'
 279
 280     return pref
 281
 282
 283 def write_json_file(obj, fn):
 284     """ Encode obj as JSON and write it to fn, atomically if possible """
 285
 286     fn = encodeFilename(fn)
 287     if sys.version_info < (3, 0) and sys.platform != 'win32':
 288         encoding = get_filesystem_encoding()
 289         # os.path.basename returns a bytes object, but NamedTemporaryFile
 290         # will fail if the filename contains non ascii characters unless we
 291         # use a unicode object
 292         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 293         # the same for os.path.dirname
 294         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 295     else:
 296         path_basename = os.path.basename
 297         path_dirname = os.path.dirname
 298
 299     args = {
 300         'suffix': '.tmp',
 301         'prefix': path_basename(fn) + '.',
 302         'dir': path_dirname(fn),
 303         'delete': False,
 304     }
 305
 306     # In Python 2.x, json.dump expects a bytestream.
 307     # In Python 3.x, it writes to a character stream
 308     if sys.version_info < (3, 0):
 309         args['mode'] = 'wb'
 310     else:
 311         args.update({
 312             'mode': 'w',
 313             'encoding': 'utf-8',
 314         })
 315
 316     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 317
 318     try:
 319         with tf:
 320             json.dump(obj, tf, ensure_ascii=False)
 321         if sys.platform == 'win32':
 322             # Need to remove existing file on Windows, else os.rename raises
 323             # WindowsError or FileExistsError.
 324             try:
 325                 os.unlink(fn)
 326             except OSError:
 327                 pass
 328         try:
 329             mask = os.umask(0)
 330             os.umask(mask)
 331             os.chmod(tf.name, 0o666 & ~mask)
 332         except OSError:
 333             pass
 334         os.rename(tf.name, fn)
 335     except Exception:
 336         try:
 337             os.remove(tf.name)
 338         except OSError:
 339             pass
 340         raise
 341
 342
 343 if sys.version_info >= (2, 7):
 344     def find_xpath_attr(node, xpath, key, val=None):
 345         """ Find the xpath xpath[@key=val] """
 346         assert re.match(r'^[a-zA-Z_-]+$', key)
 347         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 348         return node.find(expr)
 349 else:
 350     def find_xpath_attr(node, xpath, key, val=None):
 351         for f in node.findall(compat_xpath(xpath)):
 352             if key not in f.attrib:
 353                 continue
 354             if val is None or f.attrib.get(key) == val:
 355                 return f
 356         return None
 357
 358 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 359 # the namespace parameter
 360
 361
 362 def xpath_with_ns(path, ns_map):
 363     components = [c.split(':') for c in path.split('/')]
 364     replaced = []
 365     for c in components:
 366         if len(c) == 1:
 367             replaced.append(c[0])
 368         else:
 369             ns, tag = c
 370             replaced.append('{%s}%s' % (ns_map[ns], tag))
 371     return '/'.join(replaced)
 372
 373
 374 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 375     def _find_xpath(xpath):
 376         return node.find(compat_xpath(xpath))
 377
 378     if isinstance(xpath, (str, compat_str)):
 379         n = _find_xpath(xpath)
 380     else:
 381         for xp in xpath:
 382             n = _find_xpath(xp)
 383             if n is not None:
 384                 break
 385
 386     if n is None:
 387         if default is not NO_DEFAULT:
 388             return default
 389         elif fatal:
 390             name = xpath if name is None else name
 391             raise ExtractorError('Could not find XML element %s' % name)
 392         else:
 393             return None
 394     return n
 395
 396
 397 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 398     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 399     if n is None or n == default:
 400         return n
 401     if n.text is None:
 402         if default is not NO_DEFAULT:
 403             return default
 404         elif fatal:
 405             name = xpath if name is None else name
 406             raise ExtractorError('Could not find XML element\'s text %s' % name)
 407         else:
 408             return None
 409     return n.text
 410
 411
 412 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 413     n = find_xpath_attr(node, xpath, key)
 414     if n is None:
 415         if default is not NO_DEFAULT:
 416             return default
 417         elif fatal:
 418             name = '%s[@%s]' % (xpath, key) if name is None else name
 419             raise ExtractorError('Could not find XML attribute %s' % name)
 420         else:
 421             return None
 422     return n.attrib[key]
 423
 424
 425 def get_element_by_id(id, html):
 426     """Return the content of the tag with the specified ID in the passed HTML document"""
 427     return get_element_by_attribute('id', id, html)
 428
 429
 430 def get_element_html_by_id(id, html):
 431     """Return the html of the tag with the specified ID in the passed HTML document"""
 432     return get_element_html_by_attribute('id', id, html)
 433
 434
 435 def get_element_by_class(class_name, html):
 436     """Return the content of the first tag with the specified class in the passed HTML document"""
 437     retval = get_elements_by_class(class_name, html)
 438     return retval[0] if retval else None
 439
 440
 441 def get_element_html_by_class(class_name, html):
 442     """Return the html of the first tag with the specified class in the passed HTML document"""
 443     retval = get_elements_html_by_class(class_name, html)
 444     return retval[0] if retval else None
 445
 446
 447 def get_element_by_attribute(attribute, value, html, escape_value=True):
 448     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 449     return retval[0] if retval else None
 450
 451
 452 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
 453     retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
 454     return retval[0] if retval else None
 455
 456
 457 def get_elements_by_class(class_name, html):
 458     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 459     return get_elements_by_attribute(
 460         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 461         html, escape_value=False)
 462
 463
 464 def get_elements_html_by_class(class_name, html):
 465     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 466     return get_elements_html_by_attribute(
 467         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 468         html, escape_value=False)
 469
 470
 471 def get_elements_by_attribute(*args, **kwargs):
 472     """Return the content of the tag with the specified attribute in the passed HTML document"""
 473     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 474
 475
 476 def get_elements_html_by_attribute(*args, **kwargs):
 477     """Return the html of the tag with the specified attribute in the passed HTML document"""
 478     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 479
 480
 481 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 482     """
 483     Return the text (content) and the html (whole) of the tag with the specified
 484     attribute in the passed HTML document
 485     """
 486
 487     value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 488
 489     value = re.escape(value) if escape_value else value
 490
 491     partial_element_re = r'''(?x)
 492         <(?P<tag>[a-zA-Z0-9:._-]+)
 493          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 494          \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
 495         ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
 496
 497     for m in re.finditer(partial_element_re, html):
 498         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 499
 500         yield (
 501             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 502             whole
 503         )
 504
 505
 506 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 507     """
 508     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 509     closing tag for the first opening tag it has encountered, and can be used
 510     as a context manager
 511     """
 512
 513     class HTMLBreakOnClosingTagException(Exception):
 514         pass
 515
 516     def __init__(self):
 517         self.tagstack = collections.deque()
 518         compat_HTMLParser.__init__(self)
 519
 520     def __enter__(self):
 521         return self
 522
 523     def __exit__(self, *_):
 524         self.close()
 525
 526     def close(self):
 527         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 528         # so data remains buffered; we no longer have any interest in it, thus
 529         # override this method to discard it
 530         pass
 531
 532     def handle_starttag(self, tag, _):
 533         self.tagstack.append(tag)
 534
 535     def handle_endtag(self, tag):
 536         if not self.tagstack:
 537             raise compat_HTMLParseError('no tags in the stack')
 538         while self.tagstack:
 539             inner_tag = self.tagstack.pop()
 540             if inner_tag == tag:
 541                 break
 542         else:
 543             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 544         if not self.tagstack:
 545             raise self.HTMLBreakOnClosingTagException()
 546
 547
 548 def get_element_text_and_html_by_tag(tag, html):
 549     """
 550     For the first element with the specified tag in the passed HTML document
 551     return its' content (text) and the whole element (html)
 552     """
 553     def find_or_raise(haystack, needle, exc):
 554         try:
 555             return haystack.index(needle)
 556         except ValueError:
 557             raise exc
 558     closing_tag = f'</{tag}>'
 559     whole_start = find_or_raise(
 560         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 561     content_start = find_or_raise(
 562         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 563     content_start += whole_start + 1
 564     with HTMLBreakOnClosingTagParser() as parser:
 565         parser.feed(html[whole_start:content_start])
 566         if not parser.tagstack or parser.tagstack[0] != tag:
 567             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 568         offset = content_start
 569         while offset < len(html):
 570             next_closing_tag_start = find_or_raise(
 571                 html[offset:], closing_tag,
 572                 compat_HTMLParseError(f'closing {tag} tag not found'))
 573             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 574             try:
 575                 parser.feed(html[offset:offset + next_closing_tag_end])
 576                 offset += next_closing_tag_end
 577             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 578                 return html[content_start:offset + next_closing_tag_start], \
 579                     html[whole_start:offset + next_closing_tag_end]
 580         raise compat_HTMLParseError('unexpected end of html')
 581
 582
 583 class HTMLAttributeParser(compat_HTMLParser):
 584     """Trivial HTML parser to gather the attributes for a single element"""
 585
 586     def __init__(self):
 587         self.attrs = {}
 588         compat_HTMLParser.__init__(self)
 589
 590     def handle_starttag(self, tag, attrs):
 591         self.attrs = dict(attrs)
 592
 593
 594 class HTMLListAttrsParser(compat_HTMLParser):
 595     """HTML parser to gather the attributes for the elements of a list"""
 596
 597     def __init__(self):
 598         compat_HTMLParser.__init__(self)
 599         self.items = []
 600         self._level = 0
 601
 602     def handle_starttag(self, tag, attrs):
 603         if tag == 'li' and self._level == 0:
 604             self.items.append(dict(attrs))
 605         self._level += 1
 606
 607     def handle_endtag(self, tag):
 608         self._level -= 1
 609
 610
 611 def extract_attributes(html_element):
 612     """Given a string for an HTML element such as
 613     <el
 614          a="foo" B="bar" c="&98;az" d=boz
 615          empty= noval entity="&amp;"
 616          sq='"' dq="'"
 617     >
 618     Decode and return a dictionary of attributes.
 619     {
 620         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 621         'empty': '', 'noval': None, 'entity': '&',
 622         'sq': '"', 'dq': '\''
 623     }.
 624     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 625     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 626     """
 627     parser = HTMLAttributeParser()
 628     try:
 629         parser.feed(html_element)
 630         parser.close()
 631     # Older Python may throw HTMLParseError in case of malformed HTML
 632     except compat_HTMLParseError:
 633         pass
 634     return parser.attrs
 635
 636
 637 def parse_list(webpage):
 638     """Given a string for an series of HTML <li> elements,
 639     return a dictionary of their attributes"""
 640     parser = HTMLListAttrsParser()
 641     parser.feed(webpage)
 642     parser.close()
 643     return parser.items
 644
 645
 646 def clean_html(html):
 647     """Clean an HTML snippet into a readable string"""
 648
 649     if html is None:  # Convenience for sanitizing descriptions etc.
 650         return html
 651
 652     html = re.sub(r'\s+', ' ', html)
 653     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 654     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 655     # Strip html tags
 656     html = re.sub('<.*?>', '', html)
 657     # Replace html entities
 658     html = unescapeHTML(html)
 659     return html.strip()
 660
 661
 662 def sanitize_open(filename, open_mode):
 663     """Try to open the given filename, and slightly tweak it if this fails.
 664
 665     Attempts to open the given filename. If this fails, it tries to change
 666     the filename slightly, step by step, until it's either able to open it
 667     or it fails and raises a final exception, like the standard open()
 668     function.
 669
 670     It returns the tuple (stream, definitive_file_name).
 671     """
 672     try:
 673         if filename == '-':
 674             if sys.platform == 'win32':
 675                 import msvcrt
 676                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 677             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 678         stream = locked_file(filename, open_mode, block=False).open()
 679         return (stream, filename)
 680     except (IOError, OSError) as err:
 681         if err.errno in (errno.EACCES,):
 682             raise
 683
 684         # In case of error, try to remove win32 forbidden chars
 685         alt_filename = sanitize_path(filename)
 686         if alt_filename == filename:
 687             raise
 688         else:
 689             # An exception here should be caught in the caller
 690             stream = locked_file(filename, open_mode, block=False).open()
 691             return (stream, alt_filename)
 692
 693
 694 def timeconvert(timestr):
 695     """Convert RFC 2822 defined time string into system timestamp"""
 696     timestamp = None
 697     timetuple = email.utils.parsedate_tz(timestr)
 698     if timetuple is not None:
 699         timestamp = email.utils.mktime_tz(timetuple)
 700     return timestamp
 701
 702
 703 def sanitize_filename(s, restricted=False, is_id=False):
 704     """Sanitizes a string so it could be used as part of a filename.
 705     If restricted is set, use a stricter subset of allowed characters.
 706     Set is_id if this is not an arbitrary string, but an ID that should be kept
 707     if possible.
 708     """
 709     def replace_insane(char):
 710         if restricted and char in ACCENT_CHARS:
 711             return ACCENT_CHARS[char]
 712         elif not restricted and char == '\n':
 713             return ' '
 714         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 715             return ''
 716         elif char == '"':
 717             return '' if restricted else '\''
 718         elif char == ':':
 719             return '_-' if restricted else ' -'
 720         elif char in '\\/|*<>':
 721             return '_'
 722         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 723             return '_'
 724         if restricted and ord(char) > 127:
 725             return '_'
 726         return char
 727
 728     if s == '':
 729         return ''
 730     # Handle timestamps
 731     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 732     result = ''.join(map(replace_insane, s))
 733     if not is_id:
 734         while '__' in result:
 735             result = result.replace('__', '_')
 736         result = result.strip('_')
 737         # Common case of "Foreign band name - English song title"
 738         if restricted and result.startswith('-_'):
 739             result = result[2:]
 740         if result.startswith('-'):
 741             result = '_' + result[len('-'):]
 742         result = result.lstrip('.')
 743         if not result:
 744             result = '_'
 745     return result
 746
 747
 748 def sanitize_path(s, force=False):
 749     """Sanitizes and normalizes path on Windows"""
 750     if sys.platform == 'win32':
 751         force = False
 752         drive_or_unc, _ = os.path.splitdrive(s)
 753         if sys.version_info < (2, 7) and not drive_or_unc:
 754             drive_or_unc, _ = os.path.splitunc(s)
 755     elif force:
 756         drive_or_unc = ''
 757     else:
 758         return s
 759
 760     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 761     if drive_or_unc:
 762         norm_path.pop(0)
 763     sanitized_path = [
 764         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 765         for path_part in norm_path]
 766     if drive_or_unc:
 767         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 768     elif force and s[0] == os.path.sep:
 769         sanitized_path.insert(0, os.path.sep)
 770     return os.path.join(*sanitized_path)
 771
 772
 773 def sanitize_url(url):
 774     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 775     # the number of unwanted failures due to missing protocol
 776     if url.startswith('//'):
 777         return 'http:%s' % url
 778     # Fix some common typos seen so far
 779     COMMON_TYPOS = (
 780         # https://github.com/ytdl-org/youtube-dl/issues/15649
 781         (r'^httpss://', r'https://'),
 782         # https://bx1.be/lives/direct-tv/
 783         (r'^rmtp([es]?)://', r'rtmp\1://'),
 784     )
 785     for mistake, fixup in COMMON_TYPOS:
 786         if re.match(mistake, url):
 787             return re.sub(mistake, fixup, url)
 788     return url
 789
 790
 791 def extract_basic_auth(url):
 792     parts = compat_urlparse.urlsplit(url)
 793     if parts.username is None:
 794         return url, None
 795     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 796         parts.hostname if parts.port is None
 797         else '%s:%d' % (parts.hostname, parts.port))))
 798     auth_payload = base64.b64encode(
 799         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 800     return url, 'Basic ' + auth_payload.decode('utf-8')
 801
 802
 803 def sanitized_Request(url, *args, **kwargs):
 804     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 805     if auth_header is not None:
 806         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 807         headers['Authorization'] = auth_header
 808     return compat_urllib_request.Request(url, *args, **kwargs)
 809
 810
 811 def expand_path(s):
 812     """Expand shell variables and ~"""
 813     return os.path.expandvars(compat_expanduser(s))
 814
 815
 816 def orderedSet(iterable):
 817     """ Remove all duplicates from the input iterable """
 818     res = []
 819     for el in iterable:
 820         if el not in res:
 821             res.append(el)
 822     return res
 823
 824
 825 def _htmlentity_transform(entity_with_semicolon):
 826     """Transforms an HTML entity to a character."""
 827     entity = entity_with_semicolon[:-1]
 828
 829     # Known non-numeric HTML entity
 830     if entity in compat_html_entities.name2codepoint:
 831         return compat_chr(compat_html_entities.name2codepoint[entity])
 832
 833     # TODO: HTML5 allows entities without a semicolon. For example,
 834     # '&Eacuteric' should be decoded as 'Éric'.
 835     if entity_with_semicolon in compat_html_entities_html5:
 836         return compat_html_entities_html5[entity_with_semicolon]
 837
 838     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 839     if mobj is not None:
 840         numstr = mobj.group(1)
 841         if numstr.startswith('x'):
 842             base = 16
 843             numstr = '0%s' % numstr
 844         else:
 845             base = 10
 846         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 847         try:
 848             return compat_chr(int(numstr, base))
 849         except ValueError:
 850             pass
 851
 852     # Unknown entity in name, return its literal representation
 853     return '&%s;' % entity
 854
 855
 856 def unescapeHTML(s):
 857     if s is None:
 858         return None
 859     assert type(s) == compat_str
 860
 861     return re.sub(
 862         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 863
 864
 865 def escapeHTML(text):
 866     return (
 867         text
 868         .replace('&', '&amp;')
 869         .replace('<', '&lt;')
 870         .replace('>', '&gt;')
 871         .replace('"', '&quot;')
 872         .replace("'", '&#39;')
 873     )
 874
 875
 876 def process_communicate_or_kill(p, *args, **kwargs):
 877     try:
 878         return p.communicate(*args, **kwargs)
 879     except BaseException:  # Including KeyboardInterrupt
 880         p.kill()
 881         p.wait()
 882         raise
 883
 884
 885 class Popen(subprocess.Popen):
 886     if sys.platform == 'win32':
 887         _startupinfo = subprocess.STARTUPINFO()
 888         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 889     else:
 890         _startupinfo = None
 891
 892     def __init__(self, *args, **kwargs):
 893         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 894
 895     def communicate_or_kill(self, *args, **kwargs):
 896         return process_communicate_or_kill(self, *args, **kwargs)
 897
 898
 899 def get_subprocess_encoding():
 900     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 901         # For subprocess calls, encode with locale encoding
 902         # Refer to http://stackoverflow.com/a/9951851/35070
 903         encoding = preferredencoding()
 904     else:
 905         encoding = sys.getfilesystemencoding()
 906     if encoding is None:
 907         encoding = 'utf-8'
 908     return encoding
 909
 910
 911 def encodeFilename(s, for_subprocess=False):
 912     """
 913     @param s The name of the file
 914     """
 915
 916     assert type(s) == compat_str
 917
 918     # Python 3 has a Unicode API
 919     if sys.version_info >= (3, 0):
 920         return s
 921
 922     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 923     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 924     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 925     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 926         return s
 927
 928     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 929     if sys.platform.startswith('java'):
 930         return s
 931
 932     return s.encode(get_subprocess_encoding(), 'ignore')
 933
 934
 935 def decodeFilename(b, for_subprocess=False):
 936
 937     if sys.version_info >= (3, 0):
 938         return b
 939
 940     if not isinstance(b, bytes):
 941         return b
 942
 943     return b.decode(get_subprocess_encoding(), 'ignore')
 944
 945
 946 def encodeArgument(s):
 947     if not isinstance(s, compat_str):
 948         # Legacy code that uses byte strings
 949         # Uncomment the following line after fixing all post processors
 950         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 951         s = s.decode('ascii')
 952     return encodeFilename(s, True)
 953
 954
 955 def decodeArgument(b):
 956     return decodeFilename(b, True)
 957
 958
 959 def decodeOption(optval):
 960     if optval is None:
 961         return optval
 962     if isinstance(optval, bytes):
 963         optval = optval.decode(preferredencoding())
 964
 965     assert isinstance(optval, compat_str)
 966     return optval
 967
 968
 969 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 970
 971
 972 def timetuple_from_msec(msec):
 973     secs, msec = divmod(msec, 1000)
 974     mins, secs = divmod(secs, 60)
 975     hrs, mins = divmod(mins, 60)
 976     return _timetuple(hrs, mins, secs, msec)
 977
 978
 979 def formatSeconds(secs, delim=':', msec=False):
 980     time = timetuple_from_msec(secs * 1000)
 981     if time.hours:
 982         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 983     elif time.minutes:
 984         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 985     else:
 986         ret = '%d' % time.seconds
 987     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 988
 989
 990 def _ssl_load_windows_store_certs(ssl_context, storename):
 991     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 992     try:
 993         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 994                  if encoding == 'x509_asn' and (
 995                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 996     except PermissionError:
 997         return
 998     for cert in certs:
 999         try:
1000             ssl_context.load_verify_locations(cadata=cert)
1001         except ssl.SSLError:
1002             pass
1003
1004
1005 def make_HTTPS_handler(params, **kwargs):
1006     opts_check_certificate = not params.get('nocheckcertificate')
1007     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1008     context.check_hostname = opts_check_certificate
1009     if params.get('legacyserverconnect'):
1010         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
1011     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1012     if opts_check_certificate:
1013         try:
1014             context.load_default_certs()
1015             # Work around the issue in load_default_certs when there are bad certificates. See:
1016             # https://github.com/yt-dlp/yt-dlp/issues/1060,
1017             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1018         except ssl.SSLError:
1019             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1020             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1021                 # Create a new context to discard any certificates that were already loaded
1022                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1023                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
1024                 for storename in ('CA', 'ROOT'):
1025                     _ssl_load_windows_store_certs(context, storename)
1026             context.set_default_verify_paths()
1027     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1028
1029
1030 def bug_reports_message(before=';'):
1031     msg = ('please report this issue on  https://github.com/yt-dlp/yt-dlp , '
1032            'filling out the "Broken site" issue template properly. '
1033            'Confirm you are on the latest version using -U')
1034
1035     before = before.rstrip()
1036     if not before or before.endswith(('.', '!', '?')):
1037         msg = msg[0].title() + msg[1:]
1038
1039     return (before + ' ' if before else '') + msg
1040
1041
1042 class YoutubeDLError(Exception):
1043     """Base exception for YoutubeDL errors."""
1044     msg = None
1045
1046     def __init__(self, msg=None):
1047         if msg is not None:
1048             self.msg = msg
1049         elif self.msg is None:
1050             self.msg = type(self).__name__
1051         super().__init__(self.msg)
1052
1053
1054 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
1055 if hasattr(ssl, 'CertificateError'):
1056     network_exceptions.append(ssl.CertificateError)
1057 network_exceptions = tuple(network_exceptions)
1058
1059
1060 class ExtractorError(YoutubeDLError):
1061     """Error during info extraction."""
1062
1063     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1064         """ tb, if given, is the original traceback (so that it can be printed out).
1065         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1066         """
1067         if sys.exc_info()[0] in network_exceptions:
1068             expected = True
1069
1070         self.orig_msg = str(msg)
1071         self.traceback = tb
1072         self.expected = expected
1073         self.cause = cause
1074         self.video_id = video_id
1075         self.ie = ie
1076         self.exc_info = sys.exc_info()  # preserve original exception
1077
1078         super(ExtractorError, self).__init__(''.join((
1079             format_field(ie, template='[%s] '),
1080             format_field(video_id, template='%s: '),
1081             msg,
1082             format_field(cause, template=' (caused by %r)'),
1083             '' if expected else bug_reports_message())))
1084
1085     def format_traceback(self):
1086         return join_nonempty(
1087             self.traceback and ''.join(traceback.format_tb(self.traceback)),
1088             self.cause and ''.join(traceback.format_exception(self.cause)[1:]),
1089             delim='\n') or None
1090
1091
1092 class UnsupportedError(ExtractorError):
1093     def __init__(self, url):
1094         super(UnsupportedError, self).__init__(
1095             'Unsupported URL: %s' % url, expected=True)
1096         self.url = url
1097
1098
1099 class RegexNotFoundError(ExtractorError):
1100     """Error when a regex didn't match"""
1101     pass
1102
1103
1104 class GeoRestrictedError(ExtractorError):
1105     """Geographic restriction Error exception.
1106
1107     This exception may be thrown when a video is not available from your
1108     geographic location due to geographic restrictions imposed by a website.
1109     """
1110
1111     def __init__(self, msg, countries=None, **kwargs):
1112         kwargs['expected'] = True
1113         super(GeoRestrictedError, self).__init__(msg, **kwargs)
1114         self.countries = countries
1115
1116
1117 class DownloadError(YoutubeDLError):
1118     """Download Error exception.
1119
1120     This exception may be thrown by FileDownloader objects if they are not
1121     configured to continue on errors. They will contain the appropriate
1122     error message.
1123     """
1124
1125     def __init__(self, msg, exc_info=None):
1126         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1127         super(DownloadError, self).__init__(msg)
1128         self.exc_info = exc_info
1129
1130
1131 class EntryNotInPlaylist(YoutubeDLError):
1132     """Entry not in playlist exception.
1133
1134     This exception will be thrown by YoutubeDL when a requested entry
1135     is not found in the playlist info_dict
1136     """
1137     msg = 'Entry not found in info'
1138
1139
1140 class SameFileError(YoutubeDLError):
1141     """Same File exception.
1142
1143     This exception will be thrown by FileDownloader objects if they detect
1144     multiple files would have to be downloaded to the same file on disk.
1145     """
1146     msg = 'Fixed output name but more than one file to download'
1147
1148     def __init__(self, filename=None):
1149         if filename is not None:
1150             self.msg += f': {filename}'
1151         super().__init__(self.msg)
1152
1153
1154 class PostProcessingError(YoutubeDLError):
1155     """Post Processing exception.
1156
1157     This exception may be raised by PostProcessor's .run() method to
1158     indicate an error in the postprocessing task.
1159     """
1160
1161
1162 class DownloadCancelled(YoutubeDLError):
1163     """ Exception raised when the download queue should be interrupted """
1164     msg = 'The download was cancelled'
1165
1166
1167 class ExistingVideoReached(DownloadCancelled):
1168     """ --break-on-existing triggered """
1169     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1170
1171
1172 class RejectedVideoReached(DownloadCancelled):
1173     """ --break-on-reject triggered """
1174     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1175
1176
1177 class MaxDownloadsReached(DownloadCancelled):
1178     """ --max-downloads limit has been reached. """
1179     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1180
1181
1182 class ReExtractInfo(YoutubeDLError):
1183     """ Video info needs to be re-extracted. """
1184
1185     def __init__(self, msg, expected=False):
1186         super().__init__(msg)
1187         self.expected = expected
1188
1189
1190 class ThrottledDownload(ReExtractInfo):
1191     """ Download speed below --throttled-rate. """
1192     msg = 'The download speed is below throttle limit'
1193
1194     def __init__(self):
1195         super().__init__(self.msg, expected=False)
1196
1197
1198 class UnavailableVideoError(YoutubeDLError):
1199     """Unavailable Format exception.
1200
1201     This exception will be thrown when a video is requested
1202     in a format that is not available for that video.
1203     """
1204     msg = 'Unable to download video'
1205
1206     def __init__(self, err=None):
1207         if err is not None:
1208             self.msg += f': {err}'
1209         super().__init__(self.msg)
1210
1211
1212 class ContentTooShortError(YoutubeDLError):
1213     """Content Too Short exception.
1214
1215     This exception may be raised by FileDownloader objects when a file they
1216     download is too small for what the server announced first, indicating
1217     the connection was probably interrupted.
1218     """
1219
1220     def __init__(self, downloaded, expected):
1221         super(ContentTooShortError, self).__init__(
1222             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
1223         )
1224         # Both in bytes
1225         self.downloaded = downloaded
1226         self.expected = expected
1227
1228
1229 class XAttrMetadataError(YoutubeDLError):
1230     def __init__(self, code=None, msg='Unknown error'):
1231         super(XAttrMetadataError, self).__init__(msg)
1232         self.code = code
1233         self.msg = msg
1234
1235         # Parsing code and msg
1236         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1237                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1238             self.reason = 'NO_SPACE'
1239         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1240             self.reason = 'VALUE_TOO_LONG'
1241         else:
1242             self.reason = 'NOT_SUPPORTED'
1243
1244
1245 class XAttrUnavailableError(YoutubeDLError):
1246     pass
1247
1248
1249 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1250     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
1251     # expected HTTP responses to meet HTTP/1.0 or later (see also
1252     # https://github.com/ytdl-org/youtube-dl/issues/6727)
1253     if sys.version_info < (3, 0):
1254         kwargs['strict'] = True
1255     hc = http_class(*args, **compat_kwargs(kwargs))
1256     source_address = ydl_handler._params.get('source_address')
1257
1258     if source_address is not None:
1259         # This is to workaround _create_connection() from socket where it will try all
1260         # address data from getaddrinfo() including IPv6. This filters the result from
1261         # getaddrinfo() based on the source_address value.
1262         # This is based on the cpython socket.create_connection() function.
1263         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1264         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1265             host, port = address
1266             err = None
1267             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1268             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1269             ip_addrs = [addr for addr in addrs if addr[0] == af]
1270             if addrs and not ip_addrs:
1271                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1272                 raise socket.error(
1273                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1274                     % (ip_version, source_address[0]))
1275             for res in ip_addrs:
1276                 af, socktype, proto, canonname, sa = res
1277                 sock = None
1278                 try:
1279                     sock = socket.socket(af, socktype, proto)
1280                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1281                         sock.settimeout(timeout)
1282                     sock.bind(source_address)
1283                     sock.connect(sa)
1284                     err = None  # Explicitly break reference cycle
1285                     return sock
1286                 except socket.error as _:
1287                     err = _
1288                     if sock is not None:
1289                         sock.close()
1290             if err is not None:
1291                 raise err
1292             else:
1293                 raise socket.error('getaddrinfo returns an empty list')
1294         if hasattr(hc, '_create_connection'):
1295             hc._create_connection = _create_connection
1296         sa = (source_address, 0)
1297         if hasattr(hc, 'source_address'):  # Python 2.7+
1298             hc.source_address = sa
1299         else:  # Python 2.6
1300             def _hc_connect(self, *args, **kwargs):
1301                 sock = _create_connection(
1302                     (self.host, self.port), self.timeout, sa)
1303                 if is_https:
1304                     self.sock = ssl.wrap_socket(
1305                         sock, self.key_file, self.cert_file,
1306                         ssl_version=ssl.PROTOCOL_TLSv1)
1307                 else:
1308                     self.sock = sock
1309             hc.connect = functools.partial(_hc_connect, hc)
1310
1311     return hc
1312
1313
1314 def handle_youtubedl_headers(headers):
1315     filtered_headers = headers
1316
1317     if 'Youtubedl-no-compression' in filtered_headers:
1318         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
1319         del filtered_headers['Youtubedl-no-compression']
1320
1321     return filtered_headers
1322
1323
1324 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1325     """Handler for HTTP requests and responses.
1326
1327     This class, when installed with an OpenerDirector, automatically adds
1328     the standard headers to every HTTP request and handles gzipped and
1329     deflated responses from web servers. If compression is to be avoided in
1330     a particular request, the original request in the program code only has
1331     to include the HTTP header "Youtubedl-no-compression", which will be
1332     removed before making the real request.
1333
1334     Part of this code was copied from:
1335
1336     http://techknack.net/python-urllib2-handlers/
1337
1338     Andrew Rowls, the author of that code, agreed to release it to the
1339     public domain.
1340     """
1341
1342     def __init__(self, params, *args, **kwargs):
1343         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1344         self._params = params
1345
1346     def http_open(self, req):
1347         conn_class = compat_http_client.HTTPConnection
1348
1349         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1350         if socks_proxy:
1351             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1352             del req.headers['Ytdl-socks-proxy']
1353
1354         return self.do_open(functools.partial(
1355             _create_http_connection, self, conn_class, False),
1356             req)
1357
1358     @staticmethod
1359     def deflate(data):
1360         if not data:
1361             return data
1362         try:
1363             return zlib.decompress(data, -zlib.MAX_WBITS)
1364         except zlib.error:
1365             return zlib.decompress(data)
1366
1367     @staticmethod
1368     def brotli(data):
1369         if not data:
1370             return data
1371         return compat_brotli.decompress(data)
1372
1373     def http_request(self, req):
1374         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1375         # always respected by websites, some tend to give out URLs with non percent-encoded
1376         # non-ASCII characters (see telemb.py, ard.py [#3412])
1377         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1378         # To work around aforementioned issue we will replace request's original URL with
1379         # percent-encoded one
1380         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1381         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1382         url = req.get_full_url()
1383         url_escaped = escape_url(url)
1384
1385         # Substitute URL if any change after escaping
1386         if url != url_escaped:
1387             req = update_Request(req, url=url_escaped)
1388
1389         for h, v in self._params.get('http_headers', std_headers).items():
1390             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1391             # The dict keys are capitalized because of this bug by urllib
1392             if h.capitalize() not in req.headers:
1393                 req.add_header(h, v)
1394
1395         req.headers = handle_youtubedl_headers(req.headers)
1396
1397         if sys.version_info < (2, 7) and '#' in req.get_full_url():
1398             # Python 2.6 is brain-dead when it comes to fragments
1399             req._Request__original = req._Request__original.partition('#')[0]
1400             req._Request__r_type = req._Request__r_type.partition('#')[0]
1401
1402         return req
1403
1404     def http_response(self, req, resp):
1405         old_resp = resp
1406         # gzip
1407         if resp.headers.get('Content-encoding', '') == 'gzip':
1408             content = resp.read()
1409             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1410             try:
1411                 uncompressed = io.BytesIO(gz.read())
1412             except IOError as original_ioerror:
1413                 # There may be junk add the end of the file
1414                 # See http://stackoverflow.com/q/4928560/35070 for details
1415                 for i in range(1, 1024):
1416                     try:
1417                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1418                         uncompressed = io.BytesIO(gz.read())
1419                     except IOError:
1420                         continue
1421                     break
1422                 else:
1423                     raise original_ioerror
1424             resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1425             resp.msg = old_resp.msg
1426             del resp.headers['Content-encoding']
1427         # deflate
1428         if resp.headers.get('Content-encoding', '') == 'deflate':
1429             gz = io.BytesIO(self.deflate(resp.read()))
1430             resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1431             resp.msg = old_resp.msg
1432             del resp.headers['Content-encoding']
1433         # brotli
1434         if resp.headers.get('Content-encoding', '') == 'br':
1435             resp = compat_urllib_request.addinfourl(
1436                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1437             resp.msg = old_resp.msg
1438             del resp.headers['Content-encoding']
1439         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1440         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1441         if 300 <= resp.code < 400:
1442             location = resp.headers.get('Location')
1443             if location:
1444                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1445                 if sys.version_info >= (3, 0):
1446                     location = location.encode('iso-8859-1').decode('utf-8')
1447                 else:
1448                     location = location.decode('utf-8')
1449                 location_escaped = escape_url(location)
1450                 if location != location_escaped:
1451                     del resp.headers['Location']
1452                     if sys.version_info < (3, 0):
1453                         location_escaped = location_escaped.encode('utf-8')
1454                     resp.headers['Location'] = location_escaped
1455         return resp
1456
1457     https_request = http_request
1458     https_response = http_response
1459
1460
1461 def make_socks_conn_class(base_class, socks_proxy):
1462     assert issubclass(base_class, (
1463         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1464
1465     url_components = compat_urlparse.urlparse(socks_proxy)
1466     if url_components.scheme.lower() == 'socks5':
1467         socks_type = ProxyType.SOCKS5
1468     elif url_components.scheme.lower() in ('socks', 'socks4'):
1469         socks_type = ProxyType.SOCKS4
1470     elif url_components.scheme.lower() == 'socks4a':
1471         socks_type = ProxyType.SOCKS4A
1472
1473     def unquote_if_non_empty(s):
1474         if not s:
1475             return s
1476         return compat_urllib_parse_unquote_plus(s)
1477
1478     proxy_args = (
1479         socks_type,
1480         url_components.hostname, url_components.port or 1080,
1481         True,  # Remote DNS
1482         unquote_if_non_empty(url_components.username),
1483         unquote_if_non_empty(url_components.password),
1484     )
1485
1486     class SocksConnection(base_class):
1487         def connect(self):
1488             self.sock = sockssocket()
1489             self.sock.setproxy(*proxy_args)
1490             if type(self.timeout) in (int, float):
1491                 self.sock.settimeout(self.timeout)
1492             self.sock.connect((self.host, self.port))
1493
1494             if isinstance(self, compat_http_client.HTTPSConnection):
1495                 if hasattr(self, '_context'):  # Python > 2.6
1496                     self.sock = self._context.wrap_socket(
1497                         self.sock, server_hostname=self.host)
1498                 else:
1499                     self.sock = ssl.wrap_socket(self.sock)
1500
1501     return SocksConnection
1502
1503
1504 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1505     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1506         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1507         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1508         self._params = params
1509
1510     def https_open(self, req):
1511         kwargs = {}
1512         conn_class = self._https_conn_class
1513
1514         if hasattr(self, '_context'):  # python > 2.6
1515             kwargs['context'] = self._context
1516         if hasattr(self, '_check_hostname'):  # python 3.x
1517             kwargs['check_hostname'] = self._check_hostname
1518
1519         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1520         if socks_proxy:
1521             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1522             del req.headers['Ytdl-socks-proxy']
1523
1524         return self.do_open(functools.partial(
1525             _create_http_connection, self, conn_class, True),
1526             req, **kwargs)
1527
1528
1529 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1530     """
1531     See [1] for cookie file format.
1532
1533     1. https://curl.haxx.se/docs/http-cookies.html
1534     """
1535     _HTTPONLY_PREFIX = '#HttpOnly_'
1536     _ENTRY_LEN = 7
1537     _HEADER = '''# Netscape HTTP Cookie File
1538 # This file is generated by yt-dlp.  Do not edit.
1539
1540 '''
1541     _CookieFileEntry = collections.namedtuple(
1542         'CookieFileEntry',
1543         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1544
1545     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1546         """
1547         Save cookies to a file.
1548
1549         Most of the code is taken from CPython 3.8 and slightly adapted
1550         to support cookie files with UTF-8 in both python 2 and 3.
1551         """
1552         if filename is None:
1553             if self.filename is not None:
1554                 filename = self.filename
1555             else:
1556                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1557
1558         # Store session cookies with `expires` set to 0 instead of an empty
1559         # string
1560         for cookie in self:
1561             if cookie.expires is None:
1562                 cookie.expires = 0
1563
1564         with io.open(filename, 'w', encoding='utf-8') as f:
1565             f.write(self._HEADER)
1566             now = time.time()
1567             for cookie in self:
1568                 if not ignore_discard and cookie.discard:
1569                     continue
1570                 if not ignore_expires and cookie.is_expired(now):
1571                     continue
1572                 if cookie.secure:
1573                     secure = 'TRUE'
1574                 else:
1575                     secure = 'FALSE'
1576                 if cookie.domain.startswith('.'):
1577                     initial_dot = 'TRUE'
1578                 else:
1579                     initial_dot = 'FALSE'
1580                 if cookie.expires is not None:
1581                     expires = compat_str(cookie.expires)
1582                 else:
1583                     expires = ''
1584                 if cookie.value is None:
1585                     # cookies.txt regards 'Set-Cookie: foo' as a cookie
1586                     # with no name, whereas http.cookiejar regards it as a
1587                     # cookie with no value.
1588                     name = ''
1589                     value = cookie.name
1590                 else:
1591                     name = cookie.name
1592                     value = cookie.value
1593                 f.write(
1594                     '\t'.join([cookie.domain, initial_dot, cookie.path,
1595                                secure, expires, name, value]) + '\n')
1596
1597     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1598         """Load cookies from a file."""
1599         if filename is None:
1600             if self.filename is not None:
1601                 filename = self.filename
1602             else:
1603                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1604
1605         def prepare_line(line):
1606             if line.startswith(self._HTTPONLY_PREFIX):
1607                 line = line[len(self._HTTPONLY_PREFIX):]
1608             # comments and empty lines are fine
1609             if line.startswith('#') or not line.strip():
1610                 return line
1611             cookie_list = line.split('\t')
1612             if len(cookie_list) != self._ENTRY_LEN:
1613                 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1614             cookie = self._CookieFileEntry(*cookie_list)
1615             if cookie.expires_at and not cookie.expires_at.isdigit():
1616                 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1617             return line
1618
1619         cf = io.StringIO()
1620         with io.open(filename, encoding='utf-8') as f:
1621             for line in f:
1622                 try:
1623                     cf.write(prepare_line(line))
1624                 except compat_cookiejar.LoadError as e:
1625                     write_string(
1626                         'WARNING: skipping cookie file entry due to %s: %r\n'
1627                         % (e, line), sys.stderr)
1628                     continue
1629         cf.seek(0)
1630         self._really_load(cf, filename, ignore_discard, ignore_expires)
1631         # Session cookies are denoted by either `expires` field set to
1632         # an empty string or 0. MozillaCookieJar only recognizes the former
1633         # (see [1]). So we need force the latter to be recognized as session
1634         # cookies on our own.
1635         # Session cookies may be important for cookies-based authentication,
1636         # e.g. usually, when user does not check 'Remember me' check box while
1637         # logging in on a site, some important cookies are stored as session
1638         # cookies so that not recognizing them will result in failed login.
1639         # 1. https://bugs.python.org/issue17164
1640         for cookie in self:
1641             # Treat `expires=0` cookies as session cookies
1642             if cookie.expires == 0:
1643                 cookie.expires = None
1644                 cookie.discard = True
1645
1646
1647 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1648     def __init__(self, cookiejar=None):
1649         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1650
1651     def http_response(self, request, response):
1652         # Python 2 will choke on next HTTP request in row if there are non-ASCII
1653         # characters in Set-Cookie HTTP header of last response (see
1654         # https://github.com/ytdl-org/youtube-dl/issues/6769).
1655         # In order to at least prevent crashing we will percent encode Set-Cookie
1656         # header before HTTPCookieProcessor starts processing it.
1657         # if sys.version_info < (3, 0) and response.headers:
1658         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1659         #         set_cookie = response.headers.get(set_cookie_header)
1660         #         if set_cookie:
1661         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1662         #             if set_cookie != set_cookie_escaped:
1663         #                 del response.headers[set_cookie_header]
1664         #                 response.headers[set_cookie_header] = set_cookie_escaped
1665         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1666
1667     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1668     https_response = http_response
1669
1670
1671 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1672     """YoutubeDL redirect handler
1673
1674     The code is based on HTTPRedirectHandler implementation from CPython [1].
1675
1676     This redirect handler solves two issues:
1677      - ensures redirect URL is always unicode under python 2
1678      - introduces support for experimental HTTP response status code
1679        308 Permanent Redirect [2] used by some sites [3]
1680
1681     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1682     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1683     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1684     """
1685
1686     http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1687
1688     def redirect_request(self, req, fp, code, msg, headers, newurl):
1689         """Return a Request or None in response to a redirect.
1690
1691         This is called by the http_error_30x methods when a
1692         redirection response is received.  If a redirection should
1693         take place, return a new Request to allow http_error_30x to
1694         perform the redirect.  Otherwise, raise HTTPError if no-one
1695         else should try to handle this url.  Return None if you can't
1696         but another Handler might.
1697         """
1698         m = req.get_method()
1699         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1700                  or code in (301, 302, 303) and m == "POST")):
1701             raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1702         # Strictly (according to RFC 2616), 301 or 302 in response to
1703         # a POST MUST NOT cause a redirection without confirmation
1704         # from the user (of urllib.request, in this case).  In practice,
1705         # essentially all clients do redirect in this case, so we do
1706         # the same.
1707
1708         # On python 2 urlh.geturl() may sometimes return redirect URL
1709         # as byte string instead of unicode. This workaround allows
1710         # to force it always return unicode.
1711         if sys.version_info[0] < 3:
1712             newurl = compat_str(newurl)
1713
1714         # Be conciliant with URIs containing a space.  This is mainly
1715         # redundant with the more complete encoding done in http_error_302(),
1716         # but it is kept for compatibility with other callers.
1717         newurl = newurl.replace(' ', '%20')
1718
1719         CONTENT_HEADERS = ("content-length", "content-type")
1720         # NB: don't use dict comprehension for python 2.6 compatibility
1721         newheaders = dict((k, v) for k, v in req.headers.items()
1722                           if k.lower() not in CONTENT_HEADERS)
1723         return compat_urllib_request.Request(
1724             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1725             unverifiable=True)
1726
1727
1728 def extract_timezone(date_str):
1729     m = re.search(
1730         r'''(?x)
1731             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1732             (?P<tz>Z|                                            # just the UTC Z, or
1733                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1734                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1735                    [ ]?                                          # optional space
1736                 (?P<sign>\+|-)                                   # +/-
1737                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1738             $)
1739         ''', date_str)
1740     if not m:
1741         timezone = datetime.timedelta()
1742     else:
1743         date_str = date_str[:-len(m.group('tz'))]
1744         if not m.group('sign'):
1745             timezone = datetime.timedelta()
1746         else:
1747             sign = 1 if m.group('sign') == '+' else -1
1748             timezone = datetime.timedelta(
1749                 hours=sign * int(m.group('hours')),
1750                 minutes=sign * int(m.group('minutes')))
1751     return timezone, date_str
1752
1753
1754 def parse_iso8601(date_str, delimiter='T', timezone=None):
1755     """ Return a UNIX timestamp from the given date """
1756
1757     if date_str is None:
1758         return None
1759
1760     date_str = re.sub(r'\.[0-9]+', '', date_str)
1761
1762     if timezone is None:
1763         timezone, date_str = extract_timezone(date_str)
1764
1765     try:
1766         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1767         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1768         return calendar.timegm(dt.timetuple())
1769     except ValueError:
1770         pass
1771
1772
1773 def date_formats(day_first=True):
1774     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1775
1776
1777 def unified_strdate(date_str, day_first=True):
1778     """Return a string with the date in the format YYYYMMDD"""
1779
1780     if date_str is None:
1781         return None
1782     upload_date = None
1783     # Replace commas
1784     date_str = date_str.replace(',', ' ')
1785     # Remove AM/PM + timezone
1786     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1787     _, date_str = extract_timezone(date_str)
1788
1789     for expression in date_formats(day_first):
1790         try:
1791             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1792         except ValueError:
1793             pass
1794     if upload_date is None:
1795         timetuple = email.utils.parsedate_tz(date_str)
1796         if timetuple:
1797             try:
1798                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1799             except ValueError:
1800                 pass
1801     if upload_date is not None:
1802         return compat_str(upload_date)
1803
1804
1805 def unified_timestamp(date_str, day_first=True):
1806     if date_str is None:
1807         return None
1808
1809     date_str = re.sub(r'[,|]', '', date_str)
1810
1811     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1812     timezone, date_str = extract_timezone(date_str)
1813
1814     # Remove AM/PM + timezone
1815     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1816
1817     # Remove unrecognized timezones from ISO 8601 alike timestamps
1818     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1819     if m:
1820         date_str = date_str[:-len(m.group('tz'))]
1821
1822     # Python only supports microseconds, so remove nanoseconds
1823     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1824     if m:
1825         date_str = m.group(1)
1826
1827     for expression in date_formats(day_first):
1828         try:
1829             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1830             return calendar.timegm(dt.timetuple())
1831         except ValueError:
1832             pass
1833     timetuple = email.utils.parsedate_tz(date_str)
1834     if timetuple:
1835         return calendar.timegm(timetuple) + pm_delta * 3600
1836
1837
1838 def determine_ext(url, default_ext='unknown_video'):
1839     if url is None or '.' not in url:
1840         return default_ext
1841     guess = url.partition('?')[0].rpartition('.')[2]
1842     if re.match(r'^[A-Za-z0-9]+$', guess):
1843         return guess
1844     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1845     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1846         return guess.rstrip('/')
1847     else:
1848         return default_ext
1849
1850
1851 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1852     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1853
1854
1855 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1856     """
1857     Return a datetime object from a string in the format YYYYMMDD or
1858     (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1859
1860     format: string date format used to return datetime object from
1861     precision: round the time portion of a datetime object.
1862                 auto|microsecond|second|minute|hour|day.
1863                 auto: round to the unit provided in date_str (if applicable).
1864     """
1865     auto_precision = False
1866     if precision == 'auto':
1867         auto_precision = True
1868         precision = 'microsecond'
1869     today = datetime_round(datetime.datetime.utcnow(), precision)
1870     if date_str in ('now', 'today'):
1871         return today
1872     if date_str == 'yesterday':
1873         return today - datetime.timedelta(days=1)
1874     match = re.match(
1875         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1876         date_str)
1877     if match is not None:
1878         start_time = datetime_from_str(match.group('start'), precision, format)
1879         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1880         unit = match.group('unit')
1881         if unit == 'month' or unit == 'year':
1882             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1883             unit = 'day'
1884         else:
1885             if unit == 'week':
1886                 unit = 'day'
1887                 time *= 7
1888             delta = datetime.timedelta(**{unit + 's': time})
1889             new_date = start_time + delta
1890         if auto_precision:
1891             return datetime_round(new_date, unit)
1892         return new_date
1893
1894     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1895
1896
1897 def date_from_str(date_str, format='%Y%m%d', strict=False):
1898     """
1899     Return a datetime object from a string in the format YYYYMMDD or
1900     (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1901
1902     If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1903
1904     format: string date format used to return datetime object from
1905     """
1906     if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1907         raise ValueError(f'Invalid date format {date_str}')
1908     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1909
1910
1911 def datetime_add_months(dt, months):
1912     """Increment/Decrement a datetime object by months."""
1913     month = dt.month + months - 1
1914     year = dt.year + month // 12
1915     month = month % 12 + 1
1916     day = min(dt.day, calendar.monthrange(year, month)[1])
1917     return dt.replace(year, month, day)
1918
1919
1920 def datetime_round(dt, precision='day'):
1921     """
1922     Round a datetime object's time to a specific precision
1923     """
1924     if precision == 'microsecond':
1925         return dt
1926
1927     unit_seconds = {
1928         'day': 86400,
1929         'hour': 3600,
1930         'minute': 60,
1931         'second': 1,
1932     }
1933     roundto = lambda x, n: ((x + n / 2) // n) * n
1934     timestamp = calendar.timegm(dt.timetuple())
1935     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1936
1937
1938 def hyphenate_date(date_str):
1939     """
1940     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1941     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1942     if match is not None:
1943         return '-'.join(match.groups())
1944     else:
1945         return date_str
1946
1947
1948 class DateRange(object):
1949     """Represents a time interval between two dates"""
1950
1951     def __init__(self, start=None, end=None):
1952         """start and end must be strings in the format accepted by date"""
1953         if start is not None:
1954             self.start = date_from_str(start, strict=True)
1955         else:
1956             self.start = datetime.datetime.min.date()
1957         if end is not None:
1958             self.end = date_from_str(end, strict=True)
1959         else:
1960             self.end = datetime.datetime.max.date()
1961         if self.start > self.end:
1962             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1963
1964     @classmethod
1965     def day(cls, day):
1966         """Returns a range that only contains the given day"""
1967         return cls(day, day)
1968
1969     def __contains__(self, date):
1970         """Check if the date is in the range"""
1971         if not isinstance(date, datetime.date):
1972             date = date_from_str(date)
1973         return self.start <= date <= self.end
1974
1975     def __str__(self):
1976         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1977
1978
1979 def platform_name():
1980     """ Returns the platform name as a compat_str """
1981     res = platform.platform()
1982     if isinstance(res, bytes):
1983         res = res.decode(preferredencoding())
1984
1985     assert isinstance(res, compat_str)
1986     return res
1987
1988
1989 def get_windows_version():
1990     ''' Get Windows version. None if it's not running on Windows '''
1991     if compat_os_name == 'nt':
1992         return version_tuple(platform.win32_ver()[1])
1993     else:
1994         return None
1995
1996
1997 def _windows_write_string(s, out):
1998     """ Returns True if the string was written using special methods,
1999     False if it has yet to be written out."""
2000     # Adapted from http://stackoverflow.com/a/3259271/35070
2001
2002     import ctypes.wintypes
2003
2004     WIN_OUTPUT_IDS = {
2005         1: -11,
2006         2: -12,
2007     }
2008
2009     try:
2010         fileno = out.fileno()
2011     except AttributeError:
2012         # If the output stream doesn't have a fileno, it's virtual
2013         return False
2014     except io.UnsupportedOperation:
2015         # Some strange Windows pseudo files?
2016         return False
2017     if fileno not in WIN_OUTPUT_IDS:
2018         return False
2019
2020     GetStdHandle = compat_ctypes_WINFUNCTYPE(
2021         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2022         ('GetStdHandle', ctypes.windll.kernel32))
2023     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2024
2025     WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2026         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2027         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2028         ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2029     written = ctypes.wintypes.DWORD(0)
2030
2031     GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2032     FILE_TYPE_CHAR = 0x0002
2033     FILE_TYPE_REMOTE = 0x8000
2034     GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2035         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2036         ctypes.POINTER(ctypes.wintypes.DWORD))(
2037         ('GetConsoleMode', ctypes.windll.kernel32))
2038     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2039
2040     def not_a_console(handle):
2041         if handle == INVALID_HANDLE_VALUE or handle is None:
2042             return True
2043         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2044                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2045
2046     if not_a_console(h):
2047         return False
2048
2049     def next_nonbmp_pos(s):
2050         try:
2051             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2052         except StopIteration:
2053             return len(s)
2054
2055     while s:
2056         count = min(next_nonbmp_pos(s), 1024)
2057
2058         ret = WriteConsoleW(
2059             h, s, count if count else 2, ctypes.byref(written), None)
2060         if ret == 0:
2061             raise OSError('Failed to write string')
2062         if not count:  # We just wrote a non-BMP character
2063             assert written.value == 2
2064             s = s[1:]
2065         else:
2066             assert written.value > 0
2067             s = s[written.value:]
2068     return True
2069
2070
2071 def write_string(s, out=None, encoding=None):
2072     if out is None:
2073         out = sys.stderr
2074     assert type(s) == compat_str
2075
2076     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2077         if _windows_write_string(s, out):
2078             return
2079
2080     if ('b' in getattr(out, 'mode', '')
2081             or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
2082         byt = s.encode(encoding or preferredencoding(), 'ignore')
2083         out.write(byt)
2084     elif hasattr(out, 'buffer'):
2085         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2086         byt = s.encode(enc, 'ignore')
2087         out.buffer.write(byt)
2088     else:
2089         out.write(s)
2090     out.flush()
2091
2092
2093 def bytes_to_intlist(bs):
2094     if not bs:
2095         return []
2096     if isinstance(bs[0], int):  # Python 3
2097         return list(bs)
2098     else:
2099         return [ord(c) for c in bs]
2100
2101
2102 def intlist_to_bytes(xs):
2103     if not xs:
2104         return b''
2105     return compat_struct_pack('%dB' % len(xs), *xs)
2106
2107
2108 # Cross-platform file locking
2109 if sys.platform == 'win32':
2110     import ctypes.wintypes
2111     import msvcrt
2112
2113     class OVERLAPPED(ctypes.Structure):
2114         _fields_ = [
2115             ('Internal', ctypes.wintypes.LPVOID),
2116             ('InternalHigh', ctypes.wintypes.LPVOID),
2117             ('Offset', ctypes.wintypes.DWORD),
2118             ('OffsetHigh', ctypes.wintypes.DWORD),
2119             ('hEvent', ctypes.wintypes.HANDLE),
2120         ]
2121
2122     kernel32 = ctypes.windll.kernel32
2123     LockFileEx = kernel32.LockFileEx
2124     LockFileEx.argtypes = [
2125         ctypes.wintypes.HANDLE,     # hFile
2126         ctypes.wintypes.DWORD,      # dwFlags
2127         ctypes.wintypes.DWORD,      # dwReserved
2128         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2129         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2130         ctypes.POINTER(OVERLAPPED)  # Overlapped
2131     ]
2132     LockFileEx.restype = ctypes.wintypes.BOOL
2133     UnlockFileEx = kernel32.UnlockFileEx
2134     UnlockFileEx.argtypes = [
2135         ctypes.wintypes.HANDLE,     # hFile
2136         ctypes.wintypes.DWORD,      # dwReserved
2137         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2138         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2139         ctypes.POINTER(OVERLAPPED)  # Overlapped
2140     ]
2141     UnlockFileEx.restype = ctypes.wintypes.BOOL
2142     whole_low = 0xffffffff
2143     whole_high = 0x7fffffff
2144
2145     def _lock_file(f, exclusive, block):
2146         overlapped = OVERLAPPED()
2147         overlapped.Offset = 0
2148         overlapped.OffsetHigh = 0
2149         overlapped.hEvent = 0
2150         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2151
2152         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2153                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2154                           0, whole_low, whole_high, f._lock_file_overlapped_p):
2155             raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2156
2157     def _unlock_file(f):
2158         assert f._lock_file_overlapped_p
2159         handle = msvcrt.get_osfhandle(f.fileno())
2160         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2161             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2162
2163 else:
2164     try:
2165         import fcntl
2166
2167         def _lock_file(f, exclusive, block):
2168             try:
2169                 fcntl.flock(f,
2170                             fcntl.LOCK_SH if not exclusive
2171                             else fcntl.LOCK_EX if block
2172                             else fcntl.LOCK_EX | fcntl.LOCK_NB)
2173             except BlockingIOError:
2174                 raise
2175             except OSError:  # AOSP does not have flock()
2176                 fcntl.lockf(f,
2177                             fcntl.LOCK_SH if not exclusive
2178                             else fcntl.LOCK_EX if block
2179                             else fcntl.LOCK_EX | fcntl.LOCK_NB)
2180
2181         def _unlock_file(f):
2182             try:
2183                 fcntl.flock(f, fcntl.LOCK_UN)
2184             except OSError:
2185                 fcntl.lockf(f, fcntl.LOCK_UN)
2186
2187     except ImportError:
2188         UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2189
2190         def _lock_file(f, exclusive, block):
2191             raise IOError(UNSUPPORTED_MSG)
2192
2193         def _unlock_file(f):
2194             raise IOError(UNSUPPORTED_MSG)
2195
2196
2197 class locked_file(object):
2198     _closed = False
2199
2200     def __init__(self, filename, mode, block=True, encoding=None):
2201         assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2202         self.f = io.open(filename, mode, encoding=encoding)
2203         self.mode = mode
2204         self.block = block
2205
2206     def __enter__(self):
2207         exclusive = 'r' not in self.mode
2208         try:
2209             _lock_file(self.f, exclusive, self.block)
2210         except IOError:
2211             self.f.close()
2212             raise
2213         return self
2214
2215     def __exit__(self, etype, value, traceback):
2216         try:
2217             if not self._closed:
2218                 _unlock_file(self.f)
2219         finally:
2220             self.f.close()
2221             self._closed = True
2222
2223     def __iter__(self):
2224         return iter(self.f)
2225
2226     def write(self, *args):
2227         return self.f.write(*args)
2228
2229     def read(self, *args):
2230         return self.f.read(*args)
2231
2232     def flush(self):
2233         self.f.flush()
2234
2235     def open(self):
2236         return self.__enter__()
2237
2238     def close(self, *args):
2239         self.__exit__(self, *args, value=False, traceback=False)
2240
2241
2242 def get_filesystem_encoding():
2243     encoding = sys.getfilesystemencoding()
2244     return encoding if encoding is not None else 'utf-8'
2245
2246
2247 def shell_quote(args):
2248     quoted_args = []
2249     encoding = get_filesystem_encoding()
2250     for a in args:
2251         if isinstance(a, bytes):
2252             # We may get a filename encoded with 'encodeFilename'
2253             a = a.decode(encoding)
2254         quoted_args.append(compat_shlex_quote(a))
2255     return ' '.join(quoted_args)
2256
2257
2258 def smuggle_url(url, data):
2259     """ Pass additional data in a URL for internal use. """
2260
2261     url, idata = unsmuggle_url(url, {})
2262     data.update(idata)
2263     sdata = compat_urllib_parse_urlencode(
2264         {'__youtubedl_smuggle': json.dumps(data)})
2265     return url + '#' + sdata
2266
2267
2268 def unsmuggle_url(smug_url, default=None):
2269     if '#__youtubedl_smuggle' not in smug_url:
2270         return smug_url, default
2271     url, _, sdata = smug_url.rpartition('#')
2272     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2273     data = json.loads(jsond)
2274     return url, data
2275
2276
2277 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2278     """ Formats numbers with decimal sufixes like K, M, etc """
2279     num, factor = float_or_none(num), float(factor)
2280     if num is None or num < 0:
2281         return None
2282     exponent = 0 if num == 0 else int(math.log(num, factor))
2283     suffix = ['', *'kMGTPEZY'][exponent]
2284     if factor == 1024:
2285         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2286     converted = num / (factor ** exponent)
2287     return fmt % (converted, suffix)
2288
2289
2290 def format_bytes(bytes):
2291     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2292
2293
2294 def lookup_unit_table(unit_table, s):
2295     units_re = '|'.join(re.escape(u) for u in unit_table)
2296     m = re.match(
2297         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2298     if not m:
2299         return None
2300     num_str = m.group('num').replace(',', '.')
2301     mult = unit_table[m.group('unit')]
2302     return int(float(num_str) * mult)
2303
2304
2305 def parse_filesize(s):
2306     if s is None:
2307         return None
2308
2309     # The lower-case forms are of course incorrect and unofficial,
2310     # but we support those too
2311     _UNIT_TABLE = {
2312         'B': 1,
2313         'b': 1,
2314         'bytes': 1,
2315         'KiB': 1024,
2316         'KB': 1000,
2317         'kB': 1024,
2318         'Kb': 1000,
2319         'kb': 1000,
2320         'kilobytes': 1000,
2321         'kibibytes': 1024,
2322         'MiB': 1024 ** 2,
2323         'MB': 1000 ** 2,
2324         'mB': 1024 ** 2,
2325         'Mb': 1000 ** 2,
2326         'mb': 1000 ** 2,
2327         'megabytes': 1000 ** 2,
2328         'mebibytes': 1024 ** 2,
2329         'GiB': 1024 ** 3,
2330         'GB': 1000 ** 3,
2331         'gB': 1024 ** 3,
2332         'Gb': 1000 ** 3,
2333         'gb': 1000 ** 3,
2334         'gigabytes': 1000 ** 3,
2335         'gibibytes': 1024 ** 3,
2336         'TiB': 1024 ** 4,
2337         'TB': 1000 ** 4,
2338         'tB': 1024 ** 4,
2339         'Tb': 1000 ** 4,
2340         'tb': 1000 ** 4,
2341         'terabytes': 1000 ** 4,
2342         'tebibytes': 1024 ** 4,
2343         'PiB': 1024 ** 5,
2344         'PB': 1000 ** 5,
2345         'pB': 1024 ** 5,
2346         'Pb': 1000 ** 5,
2347         'pb': 1000 ** 5,
2348         'petabytes': 1000 ** 5,
2349         'pebibytes': 1024 ** 5,
2350         'EiB': 1024 ** 6,
2351         'EB': 1000 ** 6,
2352         'eB': 1024 ** 6,
2353         'Eb': 1000 ** 6,
2354         'eb': 1000 ** 6,
2355         'exabytes': 1000 ** 6,
2356         'exbibytes': 1024 ** 6,
2357         'ZiB': 1024 ** 7,
2358         'ZB': 1000 ** 7,
2359         'zB': 1024 ** 7,
2360         'Zb': 1000 ** 7,
2361         'zb': 1000 ** 7,
2362         'zettabytes': 1000 ** 7,
2363         'zebibytes': 1024 ** 7,
2364         'YiB': 1024 ** 8,
2365         'YB': 1000 ** 8,
2366         'yB': 1024 ** 8,
2367         'Yb': 1000 ** 8,
2368         'yb': 1000 ** 8,
2369         'yottabytes': 1000 ** 8,
2370         'yobibytes': 1024 ** 8,
2371     }
2372
2373     return lookup_unit_table(_UNIT_TABLE, s)
2374
2375
2376 def parse_count(s):
2377     if s is None:
2378         return None
2379
2380     s = re.sub(r'^[^\d]+\s', '', s).strip()
2381
2382     if re.match(r'^[\d,.]+$', s):
2383         return str_to_int(s)
2384
2385     _UNIT_TABLE = {
2386         'k': 1000,
2387         'K': 1000,
2388         'm': 1000 ** 2,
2389         'M': 1000 ** 2,
2390         'kk': 1000 ** 2,
2391         'KK': 1000 ** 2,
2392         'b': 1000 ** 3,
2393         'B': 1000 ** 3,
2394     }
2395
2396     ret = lookup_unit_table(_UNIT_TABLE, s)
2397     if ret is not None:
2398         return ret
2399
2400     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2401     if mobj:
2402         return str_to_int(mobj.group(1))
2403
2404
2405 def parse_resolution(s):
2406     if s is None:
2407         return {}
2408
2409     mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2410     if mobj:
2411         return {
2412             'width': int(mobj.group('w')),
2413             'height': int(mobj.group('h')),
2414         }
2415
2416     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2417     if mobj:
2418         return {'height': int(mobj.group(1))}
2419
2420     mobj = re.search(r'\b([48])[kK]\b', s)
2421     if mobj:
2422         return {'height': int(mobj.group(1)) * 540}
2423
2424     return {}
2425
2426
2427 def parse_bitrate(s):
2428     if not isinstance(s, compat_str):
2429         return
2430     mobj = re.search(r'\b(\d+)\s*kbps', s)
2431     if mobj:
2432         return int(mobj.group(1))
2433
2434
2435 def month_by_name(name, lang='en'):
2436     """ Return the number of a month by (locale-independently) English name """
2437
2438     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2439
2440     try:
2441         return month_names.index(name) + 1
2442     except ValueError:
2443         return None
2444
2445
2446 def month_by_abbreviation(abbrev):
2447     """ Return the number of a month by (locale-independently) English
2448         abbreviations """
2449
2450     try:
2451         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2452     except ValueError:
2453         return None
2454
2455
2456 def fix_xml_ampersands(xml_str):
2457     """Replace all the '&' by '&amp;' in XML"""
2458     return re.sub(
2459         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2460         '&amp;',
2461         xml_str)
2462
2463
2464 def setproctitle(title):
2465     assert isinstance(title, compat_str)
2466
2467     # ctypes in Jython is not complete
2468     # http://bugs.jython.org/issue2148
2469     if sys.platform.startswith('java'):
2470         return
2471
2472     try:
2473         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2474     except OSError:
2475         return
2476     except TypeError:
2477         # LoadLibrary in Windows Python 2.7.13 only expects
2478         # a bytestring, but since unicode_literals turns
2479         # every string into a unicode string, it fails.
2480         return
2481     title_bytes = title.encode('utf-8')
2482     buf = ctypes.create_string_buffer(len(title_bytes))
2483     buf.value = title_bytes
2484     try:
2485         libc.prctl(15, buf, 0, 0, 0)
2486     except AttributeError:
2487         return  # Strange libc, just skip this
2488
2489
2490 def remove_start(s, start):
2491     return s[len(start):] if s is not None and s.startswith(start) else s
2492
2493
2494 def remove_end(s, end):
2495     return s[:-len(end)] if s is not None and s.endswith(end) else s
2496
2497
2498 def remove_quotes(s):
2499     if s is None or len(s) < 2:
2500         return s
2501     for quote in ('"', "'", ):
2502         if s[0] == quote and s[-1] == quote:
2503             return s[1:-1]
2504     return s
2505
2506
2507 def get_domain(url):
2508     domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2509     return domain.group('domain') if domain else None
2510
2511
2512 def url_basename(url):
2513     path = compat_urlparse.urlparse(url).path
2514     return path.strip('/').split('/')[-1]
2515
2516
2517 def base_url(url):
2518     return re.match(r'https?://[^?#&]+/', url).group()
2519
2520
2521 def urljoin(base, path):
2522     if isinstance(path, bytes):
2523         path = path.decode('utf-8')
2524     if not isinstance(path, compat_str) or not path:
2525         return None
2526     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2527         return path
2528     if isinstance(base, bytes):
2529         base = base.decode('utf-8')
2530     if not isinstance(base, compat_str) or not re.match(
2531             r'^(?:https?:)?//', base):
2532         return None
2533     return compat_urlparse.urljoin(base, path)
2534
2535
2536 class HEADRequest(compat_urllib_request.Request):
2537     def get_method(self):
2538         return 'HEAD'
2539
2540
2541 class PUTRequest(compat_urllib_request.Request):
2542     def get_method(self):
2543         return 'PUT'
2544
2545
2546 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2547     if get_attr and v is not None:
2548         v = getattr(v, get_attr, None)
2549     try:
2550         return int(v) * invscale // scale
2551     except (ValueError, TypeError, OverflowError):
2552         return default
2553
2554
2555 def str_or_none(v, default=None):
2556     return default if v is None else compat_str(v)
2557
2558
2559 def str_to_int(int_str):
2560     """ A more relaxed version of int_or_none """
2561     if isinstance(int_str, compat_integer_types):
2562         return int_str
2563     elif isinstance(int_str, compat_str):
2564         int_str = re.sub(r'[,\.\+]', '', int_str)
2565         return int_or_none(int_str)
2566
2567
2568 def float_or_none(v, scale=1, invscale=1, default=None):
2569     if v is None:
2570         return default
2571     try:
2572         return float(v) * invscale / scale
2573     except (ValueError, TypeError):
2574         return default
2575
2576
2577 def bool_or_none(v, default=None):
2578     return v if isinstance(v, bool) else default
2579
2580
2581 def strip_or_none(v, default=None):
2582     return v.strip() if isinstance(v, compat_str) else default
2583
2584
2585 def url_or_none(url):
2586     if not url or not isinstance(url, compat_str):
2587         return None
2588     url = url.strip()
2589     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2590
2591
2592 def request_to_url(req):
2593     if isinstance(req, compat_urllib_request.Request):
2594         return req.get_full_url()
2595     else:
2596         return req
2597
2598
2599 def strftime_or_none(timestamp, date_format, default=None):
2600     datetime_object = None
2601     try:
2602         if isinstance(timestamp, compat_numeric_types):  # unix timestamp
2603             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2604         elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
2605             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2606         return datetime_object.strftime(date_format)
2607     except (ValueError, TypeError, AttributeError):
2608         return default
2609
2610
2611 def parse_duration(s):
2612     if not isinstance(s, compat_basestring):
2613         return None
2614     s = s.strip()
2615     if not s:
2616         return None
2617
2618     days, hours, mins, secs, ms = [None] * 5
2619     m = re.match(r'''(?x)
2620             (?P<before_secs>
2621                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2622             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2623             (?P<ms>[.:][0-9]+)?Z?$
2624         ''', s)
2625     if m:
2626         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2627     else:
2628         m = re.match(
2629             r'''(?ix)(?:P?
2630                 (?:
2631                     [0-9]+\s*y(?:ears?)?\s*
2632                 )?
2633                 (?:
2634                     [0-9]+\s*m(?:onths?)?\s*
2635                 )?
2636                 (?:
2637                     [0-9]+\s*w(?:eeks?)?\s*
2638                 )?
2639                 (?:
2640                     (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2641                 )?
2642                 T)?
2643                 (?:
2644                     (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2645                 )?
2646                 (?:
2647                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2648                 )?
2649                 (?:
2650                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2651                 )?Z?$''', s)
2652         if m:
2653             days, hours, mins, secs, ms = m.groups()
2654         else:
2655             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2656             if m:
2657                 hours, mins = m.groups()
2658             else:
2659                 return None
2660
2661     duration = 0
2662     if secs:
2663         duration += float(secs)
2664     if mins:
2665         duration += float(mins) * 60
2666     if hours:
2667         duration += float(hours) * 60 * 60
2668     if days:
2669         duration += float(days) * 24 * 60 * 60
2670     if ms:
2671         duration += float(ms.replace(':', '.'))
2672     return duration
2673
2674
2675 def prepend_extension(filename, ext, expected_real_ext=None):
2676     name, real_ext = os.path.splitext(filename)
2677     return (
2678         '{0}.{1}{2}'.format(name, ext, real_ext)
2679         if not expected_real_ext or real_ext[1:] == expected_real_ext
2680         else '{0}.{1}'.format(filename, ext))
2681
2682
2683 def replace_extension(filename, ext, expected_real_ext=None):
2684     name, real_ext = os.path.splitext(filename)
2685     return '{0}.{1}'.format(
2686         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2687         ext)
2688
2689
2690 def check_executable(exe, args=[]):
2691     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2692     args can be a list of arguments for a short output (like -version) """
2693     try:
2694         Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2695     except OSError:
2696         return False
2697     return exe
2698
2699
2700 def _get_exe_version_output(exe, args):
2701     try:
2702         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2703         # SIGTTOU if yt-dlp is run in the background.
2704         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2705         out, _ = Popen(
2706             [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2707             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2708     except OSError:
2709         return False
2710     if isinstance(out, bytes):  # Python 2.x
2711         out = out.decode('ascii', 'ignore')
2712     return out
2713
2714
2715 def detect_exe_version(output, version_re=None, unrecognized='present'):
2716     assert isinstance(output, compat_str)
2717     if version_re is None:
2718         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2719     m = re.search(version_re, output)
2720     if m:
2721         return m.group(1)
2722     else:
2723         return unrecognized
2724
2725
2726 def get_exe_version(exe, args=['--version'],
2727                     version_re=None, unrecognized='present'):
2728     """ Returns the version of the specified executable,
2729     or False if the executable is not present """
2730     out = _get_exe_version_output(exe, args)
2731     return detect_exe_version(out, version_re, unrecognized) if out else False
2732
2733
2734 class LazyList(collections.abc.Sequence):
2735     ''' Lazy immutable list from an iterable
2736     Note that slices of a LazyList are lists and not LazyList'''
2737
2738     class IndexError(IndexError):
2739         pass
2740
2741     def __init__(self, iterable, *, reverse=False, _cache=None):
2742         self.__iterable = iter(iterable)
2743         self.__cache = [] if _cache is None else _cache
2744         self.__reversed = reverse
2745
2746     def __iter__(self):
2747         if self.__reversed:
2748             # We need to consume the entire iterable to iterate in reverse
2749             yield from self.exhaust()
2750             return
2751         yield from self.__cache
2752         for item in self.__iterable:
2753             self.__cache.append(item)
2754             yield item
2755
2756     def __exhaust(self):
2757         self.__cache.extend(self.__iterable)
2758         # Discard the emptied iterable to make it pickle-able
2759         self.__iterable = []
2760         return self.__cache
2761
2762     def exhaust(self):
2763         ''' Evaluate the entire iterable '''
2764         return self.__exhaust()[::-1 if self.__reversed else 1]
2765
2766     @staticmethod
2767     def __reverse_index(x):
2768         return None if x is None else -(x + 1)
2769
2770     def __getitem__(self, idx):
2771         if isinstance(idx, slice):
2772             if self.__reversed:
2773                 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2774             start, stop, step = idx.start, idx.stop, idx.step or 1
2775         elif isinstance(idx, int):
2776             if self.__reversed:
2777                 idx = self.__reverse_index(idx)
2778             start, stop, step = idx, idx, 0
2779         else:
2780             raise TypeError('indices must be integers or slices')
2781         if ((start or 0) < 0 or (stop or 0) < 0
2782                 or (start is None and step < 0)
2783                 or (stop is None and step > 0)):
2784             # We need to consume the entire iterable to be able to slice from the end
2785             # Obviously, never use this with infinite iterables
2786             self.__exhaust()
2787             try:
2788                 return self.__cache[idx]
2789             except IndexError as e:
2790                 raise self.IndexError(e) from e
2791         n = max(start or 0, stop or 0) - len(self.__cache) + 1
2792         if n > 0:
2793             self.__cache.extend(itertools.islice(self.__iterable, n))
2794         try:
2795             return self.__cache[idx]
2796         except IndexError as e:
2797             raise self.IndexError(e) from e
2798
2799     def __bool__(self):
2800         try:
2801             self[-1] if self.__reversed else self[0]
2802         except self.IndexError:
2803             return False
2804         return True
2805
2806     def __len__(self):
2807         self.__exhaust()
2808         return len(self.__cache)
2809
2810     def __reversed__(self):
2811         return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2812
2813     def __copy__(self):
2814         return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2815
2816     def __repr__(self):
2817         # repr and str should mimic a list. So we exhaust the iterable
2818         return repr(self.exhaust())
2819
2820     def __str__(self):
2821         return repr(self.exhaust())
2822
2823
2824 class PagedList:
2825
2826     class IndexError(IndexError):
2827         pass
2828
2829     def __len__(self):
2830         # This is only useful for tests
2831         return len(self.getslice())
2832
2833     def __init__(self, pagefunc, pagesize, use_cache=True):
2834         self._pagefunc = pagefunc
2835         self._pagesize = pagesize
2836         self._pagecount = float('inf')
2837         self._use_cache = use_cache
2838         self._cache = {}
2839
2840     def getpage(self, pagenum):
2841         page_results = self._cache.get(pagenum)
2842         if page_results is None:
2843             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2844         if self._use_cache:
2845             self._cache[pagenum] = page_results
2846         return page_results
2847
2848     def getslice(self, start=0, end=None):
2849         return list(self._getslice(start, end))
2850
2851     def _getslice(self, start, end):
2852         raise NotImplementedError('This method must be implemented by subclasses')
2853
2854     def __getitem__(self, idx):
2855         assert self._use_cache, 'Indexing PagedList requires cache'
2856         if not isinstance(idx, int) or idx < 0:
2857             raise TypeError('indices must be non-negative integers')
2858         entries = self.getslice(idx, idx + 1)
2859         if not entries:
2860             raise self.IndexError()
2861         return entries[0]
2862
2863
2864 class OnDemandPagedList(PagedList):
2865     def _getslice(self, start, end):
2866         for pagenum in itertools.count(start // self._pagesize):
2867             firstid = pagenum * self._pagesize
2868             nextfirstid = pagenum * self._pagesize + self._pagesize
2869             if start >= nextfirstid:
2870                 continue
2871
2872             startv = (
2873                 start % self._pagesize
2874                 if firstid <= start < nextfirstid
2875                 else 0)
2876             endv = (
2877                 ((end - 1) % self._pagesize) + 1
2878                 if (end is not None and firstid <= end <= nextfirstid)
2879                 else None)
2880
2881             try:
2882                 page_results = self.getpage(pagenum)
2883             except Exception:
2884                 self._pagecount = pagenum - 1
2885                 raise
2886             if startv != 0 or endv is not None:
2887                 page_results = page_results[startv:endv]
2888             yield from page_results
2889
2890             # A little optimization - if current page is not "full", ie. does
2891             # not contain page_size videos then we can assume that this page
2892             # is the last one - there are no more ids on further pages -
2893             # i.e. no need to query again.
2894             if len(page_results) + startv < self._pagesize:
2895                 break
2896
2897             # If we got the whole page, but the next page is not interesting,
2898             # break out early as well
2899             if end == nextfirstid:
2900                 break
2901
2902
2903 class InAdvancePagedList(PagedList):
2904     def __init__(self, pagefunc, pagecount, pagesize):
2905         PagedList.__init__(self, pagefunc, pagesize, True)
2906         self._pagecount = pagecount
2907
2908     def _getslice(self, start, end):
2909         start_page = start // self._pagesize
2910         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2911         skip_elems = start - start_page * self._pagesize
2912         only_more = None if end is None else end - start
2913         for pagenum in range(start_page, end_page):
2914             page_results = self.getpage(pagenum)
2915             if skip_elems:
2916                 page_results = page_results[skip_elems:]
2917                 skip_elems = None
2918             if only_more is not None:
2919                 if len(page_results) < only_more:
2920                     only_more -= len(page_results)
2921                 else:
2922                     yield from page_results[:only_more]
2923                     break
2924             yield from page_results
2925
2926
2927 def uppercase_escape(s):
2928     unicode_escape = codecs.getdecoder('unicode_escape')
2929     return re.sub(
2930         r'\\U[0-9a-fA-F]{8}',
2931         lambda m: unicode_escape(m.group(0))[0],
2932         s)
2933
2934
2935 def lowercase_escape(s):
2936     unicode_escape = codecs.getdecoder('unicode_escape')
2937     return re.sub(
2938         r'\\u[0-9a-fA-F]{4}',
2939         lambda m: unicode_escape(m.group(0))[0],
2940         s)
2941
2942
2943 def escape_rfc3986(s):
2944     """Escape non-ASCII characters as suggested by RFC 3986"""
2945     if sys.version_info < (3, 0) and isinstance(s, compat_str):
2946         s = s.encode('utf-8')
2947     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2948
2949
2950 def escape_url(url):
2951     """Escape URL as suggested by RFC 3986"""
2952     url_parsed = compat_urllib_parse_urlparse(url)
2953     return url_parsed._replace(
2954         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2955         path=escape_rfc3986(url_parsed.path),
2956         params=escape_rfc3986(url_parsed.params),
2957         query=escape_rfc3986(url_parsed.query),
2958         fragment=escape_rfc3986(url_parsed.fragment)
2959     ).geturl()
2960
2961
2962 def parse_qs(url):
2963     return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2964
2965
2966 def read_batch_urls(batch_fd):
2967     def fixup(url):
2968         if not isinstance(url, compat_str):
2969             url = url.decode('utf-8', 'replace')
2970         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2971         for bom in BOM_UTF8:
2972             if url.startswith(bom):
2973                 url = url[len(bom):]
2974         url = url.lstrip()
2975         if not url or url.startswith(('#', ';', ']')):
2976             return False
2977         # "#" cannot be stripped out since it is part of the URI
2978         # However, it can be safely stipped out if follwing a whitespace
2979         return re.split(r'\s#', url, 1)[0].rstrip()
2980
2981     with contextlib.closing(batch_fd) as fd:
2982         return [url for url in map(fixup, fd) if url]
2983
2984
2985 def urlencode_postdata(*args, **kargs):
2986     return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2987
2988
2989 def update_url_query(url, query):
2990     if not query:
2991         return url
2992     parsed_url = compat_urlparse.urlparse(url)
2993     qs = compat_parse_qs(parsed_url.query)
2994     qs.update(query)
2995     return compat_urlparse.urlunparse(parsed_url._replace(
2996         query=compat_urllib_parse_urlencode(qs, True)))
2997
2998
2999 def update_Request(req, url=None, data=None, headers={}, query={}):
3000     req_headers = req.headers.copy()
3001     req_headers.update(headers)
3002     req_data = data or req.data
3003     req_url = update_url_query(url or req.get_full_url(), query)
3004     req_get_method = req.get_method()
3005     if req_get_method == 'HEAD':
3006         req_type = HEADRequest
3007     elif req_get_method == 'PUT':
3008         req_type = PUTRequest
3009     else:
3010         req_type = compat_urllib_request.Request
3011     new_req = req_type(
3012         req_url, data=req_data, headers=req_headers,
3013         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3014     if hasattr(req, 'timeout'):
3015         new_req.timeout = req.timeout
3016     return new_req
3017
3018
3019 def _multipart_encode_impl(data, boundary):
3020     content_type = 'multipart/form-data; boundary=%s' % boundary
3021
3022     out = b''
3023     for k, v in data.items():
3024         out += b'--' + boundary.encode('ascii') + b'\r\n'
3025         if isinstance(k, compat_str):
3026             k = k.encode('utf-8')
3027         if isinstance(v, compat_str):
3028             v = v.encode('utf-8')
3029         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3030         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3031         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3032         if boundary.encode('ascii') in content:
3033             raise ValueError('Boundary overlaps with data')
3034         out += content
3035
3036     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3037
3038     return out, content_type
3039
3040
3041 def multipart_encode(data, boundary=None):
3042     '''
3043     Encode a dict to RFC 7578-compliant form-data
3044
3045     data:
3046         A dict where keys and values can be either Unicode or bytes-like
3047         objects.
3048     boundary:
3049         If specified a Unicode object, it's used as the boundary. Otherwise
3050         a random boundary is generated.
3051
3052     Reference: https://tools.ietf.org/html/rfc7578
3053     '''
3054     has_specified_boundary = boundary is not None
3055
3056     while True:
3057         if boundary is None:
3058             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3059
3060         try:
3061             out, content_type = _multipart_encode_impl(data, boundary)
3062             break
3063         except ValueError:
3064             if has_specified_boundary:
3065                 raise
3066             boundary = None
3067
3068     return out, content_type
3069
3070
3071 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3072     if isinstance(key_or_keys, (list, tuple)):
3073         for key in key_or_keys:
3074             if key not in d or d[key] is None or skip_false_values and not d[key]:
3075                 continue
3076             return d[key]
3077         return default
3078     return d.get(key_or_keys, default)
3079
3080
3081 def try_get(src, getter, expected_type=None):
3082     for get in variadic(getter):
3083         try:
3084             v = get(src)
3085         except (AttributeError, KeyError, TypeError, IndexError):
3086             pass
3087         else:
3088             if expected_type is None or isinstance(v, expected_type):
3089                 return v
3090
3091
3092 def merge_dicts(*dicts):
3093     merged = {}
3094     for a_dict in dicts:
3095         for k, v in a_dict.items():
3096             if v is None:
3097                 continue
3098             if (k not in merged
3099                     or (isinstance(v, compat_str) and v
3100                         and isinstance(merged[k], compat_str)
3101                         and not merged[k])):
3102                 merged[k] = v
3103     return merged
3104
3105
3106 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3107     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3108
3109
3110 US_RATINGS = {
3111     'G': 0,
3112     'PG': 10,
3113     'PG-13': 13,
3114     'R': 16,
3115     'NC': 18,
3116 }
3117
3118
3119 TV_PARENTAL_GUIDELINES = {
3120     'TV-Y': 0,
3121     'TV-Y7': 7,
3122     'TV-G': 0,
3123     'TV-PG': 0,
3124     'TV-14': 14,
3125     'TV-MA': 17,
3126 }
3127
3128
3129 def parse_age_limit(s):
3130     if type(s) == int:
3131         return s if 0 <= s <= 21 else None
3132     if not isinstance(s, compat_basestring):
3133         return None
3134     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3135     if m:
3136         return int(m.group('age'))
3137     s = s.upper()
3138     if s in US_RATINGS:
3139         return US_RATINGS[s]
3140     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3141     if m:
3142         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3143     return None
3144
3145
3146 def strip_jsonp(code):
3147     return re.sub(
3148         r'''(?sx)^
3149             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3150             (?:\s*&&\s*(?P=func_name))?
3151             \s*\(\s*(?P<callback_data>.*)\);?
3152             \s*?(?://[^\n]*)*$''',
3153         r'\g<callback_data>', code)
3154
3155
3156 def js_to_json(code, vars={}):
3157     # vars is a dict of var, val pairs to substitute
3158     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3159     SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3160     INTEGER_TABLE = (
3161         (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3162         (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3163     )
3164
3165     def fix_kv(m):
3166         v = m.group(0)
3167         if v in ('true', 'false', 'null'):
3168             return v
3169         elif v in ('undefined', 'void 0'):
3170             return 'null'
3171         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3172             return ""
3173
3174         if v[0] in ("'", '"'):
3175             v = re.sub(r'(?s)\\.|"', lambda m: {
3176                 '"': '\\"',
3177                 "\\'": "'",
3178                 '\\\n': '',
3179                 '\\x': '\\u00',
3180             }.get(m.group(0), m.group(0)), v[1:-1])
3181         else:
3182             for regex, base in INTEGER_TABLE:
3183                 im = re.match(regex, v)
3184                 if im:
3185                     i = int(im.group(1), base)
3186                     return '"%d":' % i if v.endswith(':') else '%d' % i
3187
3188             if v in vars:
3189                 return vars[v]
3190
3191         return '"%s"' % v
3192
3193     code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3194
3195     return re.sub(r'''(?sx)
3196         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3197         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3198         {comment}|,(?={skip}[\]}}])|
3199         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3200         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3201         [0-9]+(?={skip}:)|
3202         !+
3203         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3204
3205
3206 def qualities(quality_ids):
3207     """ Get a numeric quality value out of a list of possible values """
3208     def q(qid):
3209         try:
3210             return quality_ids.index(qid)
3211         except ValueError:
3212             return -1
3213     return q
3214
3215
3216 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3217
3218
3219 DEFAULT_OUTTMPL = {
3220     'default': '%(title)s [%(id)s].%(ext)s',
3221     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3222 }
3223 OUTTMPL_TYPES = {
3224     'chapter': None,
3225     'subtitle': None,
3226     'thumbnail': None,
3227     'description': 'description',
3228     'annotation': 'annotations.xml',
3229     'infojson': 'info.json',
3230     'link': None,
3231     'pl_video': None,
3232     'pl_thumbnail': None,
3233     'pl_description': 'description',
3234     'pl_infojson': 'info.json',
3235 }
3236
3237 # As of [1] format syntax is:
3238 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3239 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3240 STR_FORMAT_RE_TMPL = r'''(?x)
3241     (?<!%)(?P<prefix>(?:%%)*)
3242     %
3243     (?P<has_key>\((?P<key>{0})\))?
3244     (?P<format>
3245         (?P<conversion>[#0\-+ ]+)?
3246         (?P<min_width>\d+)?
3247         (?P<precision>\.\d+)?
3248         (?P<len_mod>[hlL])?  # unused in python
3249         {1}  # conversion type
3250     )
3251 '''
3252
3253
3254 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3255
3256
3257 def limit_length(s, length):
3258     """ Add ellipses to overly long strings """
3259     if s is None:
3260         return None
3261     ELLIPSES = '...'
3262     if len(s) > length:
3263         return s[:length - len(ELLIPSES)] + ELLIPSES
3264     return s
3265
3266
3267 def version_tuple(v):
3268     return tuple(int(e) for e in re.split(r'[-.]', v))
3269
3270
3271 def is_outdated_version(version, limit, assume_new=True):
3272     if not version:
3273         return not assume_new
3274     try:
3275         return version_tuple(version) < version_tuple(limit)
3276     except ValueError:
3277         return not assume_new
3278
3279
3280 def ytdl_is_updateable():
3281     """ Returns if yt-dlp can be updated with -U """
3282
3283     from .update import is_non_updateable
3284
3285     return not is_non_updateable()
3286
3287
3288 def args_to_str(args):
3289     # Get a short string representation for a subprocess command
3290     return ' '.join(compat_shlex_quote(a) for a in args)
3291
3292
3293 def error_to_compat_str(err):
3294     err_str = str(err)
3295     # On python 2 error byte string must be decoded with proper
3296     # encoding rather than ascii
3297     if sys.version_info[0] < 3:
3298         err_str = err_str.decode(preferredencoding())
3299     return err_str
3300
3301
3302 def mimetype2ext(mt):
3303     if mt is None:
3304         return None
3305
3306     mt, _, params = mt.partition(';')
3307     mt = mt.strip()
3308
3309     FULL_MAP = {
3310         'audio/mp4': 'm4a',
3311         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3312         # it's the most popular one
3313         'audio/mpeg': 'mp3',
3314         'audio/x-wav': 'wav',
3315         'audio/wav': 'wav',
3316         'audio/wave': 'wav',
3317     }
3318
3319     ext = FULL_MAP.get(mt)
3320     if ext is not None:
3321         return ext
3322
3323     SUBTYPE_MAP = {
3324         '3gpp': '3gp',
3325         'smptett+xml': 'tt',
3326         'ttaf+xml': 'dfxp',
3327         'ttml+xml': 'ttml',
3328         'x-flv': 'flv',
3329         'x-mp4-fragmented': 'mp4',
3330         'x-ms-sami': 'sami',
3331         'x-ms-wmv': 'wmv',
3332         'mpegurl': 'm3u8',
3333         'x-mpegurl': 'm3u8',
3334         'vnd.apple.mpegurl': 'm3u8',
3335         'dash+xml': 'mpd',
3336         'f4m+xml': 'f4m',
3337         'hds+xml': 'f4m',
3338         'vnd.ms-sstr+xml': 'ism',
3339         'quicktime': 'mov',
3340         'mp2t': 'ts',
3341         'x-wav': 'wav',
3342         'filmstrip+json': 'fs',
3343         'svg+xml': 'svg',
3344     }
3345
3346     _, _, subtype = mt.rpartition('/')
3347     ext = SUBTYPE_MAP.get(subtype.lower())
3348     if ext is not None:
3349         return ext
3350
3351     SUFFIX_MAP = {
3352         'json': 'json',
3353         'xml': 'xml',
3354         'zip': 'zip',
3355         'gzip': 'gz',
3356     }
3357
3358     _, _, suffix = subtype.partition('+')
3359     ext = SUFFIX_MAP.get(suffix)
3360     if ext is not None:
3361         return ext
3362
3363     return subtype.replace('+', '.')
3364
3365
3366 def ext2mimetype(ext_or_url):
3367     if not ext_or_url:
3368         return None
3369     if '.' not in ext_or_url:
3370         ext_or_url = f'file.{ext_or_url}'
3371     return mimetypes.guess_type(ext_or_url)[0]
3372
3373
3374 def parse_codecs(codecs_str):
3375     # http://tools.ietf.org/html/rfc6381
3376     if not codecs_str:
3377         return {}
3378     split_codecs = list(filter(None, map(
3379         str.strip, codecs_str.strip().strip(',').split(','))))
3380     vcodec, acodec, tcodec, hdr = None, None, None, None
3381     for full_codec in split_codecs:
3382         parts = full_codec.split('.')
3383         codec = parts[0].replace('0', '')
3384         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3385                      'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3386             if not vcodec:
3387                 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3388                 if codec in ('dvh1', 'dvhe'):
3389                     hdr = 'DV'
3390                 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3391                     hdr = 'HDR10'
3392                 elif full_codec.replace('0', '').startswith('vp9.2'):
3393                     hdr = 'HDR10'
3394         elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3395             if not acodec:
3396                 acodec = full_codec
3397         elif codec in ('stpp', 'wvtt',):
3398             if not tcodec:
3399                 tcodec = full_codec
3400         else:
3401             write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3402     if vcodec or acodec or tcodec:
3403         return {
3404             'vcodec': vcodec or 'none',
3405             'acodec': acodec or 'none',
3406             'dynamic_range': hdr,
3407             **({'tcodec': tcodec} if tcodec is not None else {}),
3408         }
3409     elif len(split_codecs) == 2:
3410         return {
3411             'vcodec': split_codecs[0],
3412             'acodec': split_codecs[1],
3413         }
3414     return {}
3415
3416
3417 def urlhandle_detect_ext(url_handle):
3418     getheader = url_handle.headers.get
3419
3420     cd = getheader('Content-Disposition')
3421     if cd:
3422         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3423         if m:
3424             e = determine_ext(m.group('filename'), default_ext=None)
3425             if e:
3426                 return e
3427
3428     return mimetype2ext(getheader('Content-Type'))
3429
3430
3431 def encode_data_uri(data, mime_type):
3432     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3433
3434
3435 def age_restricted(content_limit, age_limit):
3436     """ Returns True iff the content should be blocked """
3437
3438     if age_limit is None:  # No limit set
3439         return False
3440     if content_limit is None:
3441         return False  # Content available for everyone
3442     return age_limit < content_limit
3443
3444
3445 def is_html(first_bytes):
3446     """ Detect whether a file contains HTML by examining its first bytes. """
3447
3448     BOMS = [
3449         (b'\xef\xbb\xbf', 'utf-8'),
3450         (b'\x00\x00\xfe\xff', 'utf-32-be'),
3451         (b'\xff\xfe\x00\x00', 'utf-32-le'),
3452         (b'\xff\xfe', 'utf-16-le'),
3453         (b'\xfe\xff', 'utf-16-be'),
3454     ]
3455     for bom, enc in BOMS:
3456         if first_bytes.startswith(bom):
3457             s = first_bytes[len(bom):].decode(enc, 'replace')
3458             break
3459     else:
3460         s = first_bytes.decode('utf-8', 'replace')
3461
3462     return re.match(r'^\s*<', s)
3463
3464
3465 def determine_protocol(info_dict):
3466     protocol = info_dict.get('protocol')
3467     if protocol is not None:
3468         return protocol
3469
3470     url = sanitize_url(info_dict['url'])
3471     if url.startswith('rtmp'):
3472         return 'rtmp'
3473     elif url.startswith('mms'):
3474         return 'mms'
3475     elif url.startswith('rtsp'):
3476         return 'rtsp'
3477
3478     ext = determine_ext(url)
3479     if ext == 'm3u8':
3480         return 'm3u8'
3481     elif ext == 'f4m':
3482         return 'f4m'
3483
3484     return compat_urllib_parse_urlparse(url).scheme
3485
3486
3487 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3488     """ Render a list of rows, each as a list of values.
3489     Text after a \t will be right aligned """
3490     def width(string):
3491         return len(remove_terminal_sequences(string).replace('\t', ''))
3492
3493     def get_max_lens(table):
3494         return [max(width(str(v)) for v in col) for col in zip(*table)]
3495
3496     def filter_using_list(row, filterArray):
3497         return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3498
3499     max_lens = get_max_lens(data) if hide_empty else []
3500     header_row = filter_using_list(header_row, max_lens)
3501     data = [filter_using_list(row, max_lens) for row in data]
3502
3503     table = [header_row] + data
3504     max_lens = get_max_lens(table)
3505     extra_gap += 1
3506     if delim:
3507         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3508         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3509     for row in table:
3510         for pos, text in enumerate(map(str, row)):
3511             if '\t' in text:
3512                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3513             else:
3514                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3515     ret = '\n'.join(''.join(row).rstrip() for row in table)
3516     return ret
3517
3518
3519 def _match_one(filter_part, dct, incomplete):
3520     # TODO: Generalize code with YoutubeDL._build_format_filter
3521     STRING_OPERATORS = {
3522         '*=': operator.contains,
3523         '^=': lambda attr, value: attr.startswith(value),
3524         '$=': lambda attr, value: attr.endswith(value),
3525         '~=': lambda attr, value: re.search(value, attr),
3526     }
3527     COMPARISON_OPERATORS = {
3528         **STRING_OPERATORS,
3529         '<=': operator.le,  # "<=" must be defined above "<"
3530         '<': operator.lt,
3531         '>=': operator.ge,
3532         '>': operator.gt,
3533         '=': operator.eq,
3534     }
3535
3536     operator_rex = re.compile(r'''(?x)\s*
3537         (?P<key>[a-z_]+)
3538         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3539         (?:
3540             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3541             (?P<strval>.+?)
3542         )
3543         \s*$
3544         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3545     m = operator_rex.search(filter_part)
3546     if m:
3547         m = m.groupdict()
3548         unnegated_op = COMPARISON_OPERATORS[m['op']]
3549         if m['negation']:
3550             op = lambda attr, value: not unnegated_op(attr, value)
3551         else:
3552             op = unnegated_op
3553         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3554         if m['quote']:
3555             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3556         actual_value = dct.get(m['key'])
3557         numeric_comparison = None
3558         if isinstance(actual_value, compat_numeric_types):
3559             # If the original field is a string and matching comparisonvalue is
3560             # a number we should respect the origin of the original field
3561             # and process comparison value as a string (see
3562             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3563             try:
3564                 numeric_comparison = int(comparison_value)
3565             except ValueError:
3566                 numeric_comparison = parse_filesize(comparison_value)
3567                 if numeric_comparison is None:
3568                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3569                 if numeric_comparison is None:
3570                     numeric_comparison = parse_duration(comparison_value)
3571         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3572             raise ValueError('Operator %s only supports string values!' % m['op'])
3573         if actual_value is None:
3574             return incomplete or m['none_inclusive']
3575         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3576
3577     UNARY_OPERATORS = {
3578         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3579         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3580     }
3581     operator_rex = re.compile(r'''(?x)\s*
3582         (?P<op>%s)\s*(?P<key>[a-z_]+)
3583         \s*$
3584         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3585     m = operator_rex.search(filter_part)
3586     if m:
3587         op = UNARY_OPERATORS[m.group('op')]
3588         actual_value = dct.get(m.group('key'))
3589         if incomplete and actual_value is None:
3590             return True
3591         return op(actual_value)
3592
3593     raise ValueError('Invalid filter part %r' % filter_part)
3594
3595
3596 def match_str(filter_str, dct, incomplete=False):
3597     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3598         When incomplete, all conditions passes on missing fields
3599     """
3600     return all(
3601         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3602         for filter_part in re.split(r'(?<!\\)&', filter_str))
3603
3604
3605 def match_filter_func(filter_str):
3606     def _match_func(info_dict, *args, **kwargs):
3607         if match_str(filter_str, info_dict, *args, **kwargs):
3608             return None
3609         else:
3610             video_title = info_dict.get('title', info_dict.get('id', 'video'))
3611             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3612     return _match_func
3613
3614
3615 def parse_dfxp_time_expr(time_expr):
3616     if not time_expr:
3617         return
3618
3619     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3620     if mobj:
3621         return float(mobj.group('time_offset'))
3622
3623     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3624     if mobj:
3625         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3626
3627
3628 def srt_subtitles_timecode(seconds):
3629     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3630
3631
3632 def ass_subtitles_timecode(seconds):
3633     time = timetuple_from_msec(seconds * 1000)
3634     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3635
3636
3637 def dfxp2srt(dfxp_data):
3638     '''
3639     @param dfxp_data A bytes-like object containing DFXP data
3640     @returns A unicode object containing converted SRT data
3641     '''
3642     LEGACY_NAMESPACES = (
3643         (b'http://www.w3.org/ns/ttml', [
3644             b'http://www.w3.org/2004/11/ttaf1',
3645             b'http://www.w3.org/2006/04/ttaf1',
3646             b'http://www.w3.org/2006/10/ttaf1',
3647         ]),
3648         (b'http://www.w3.org/ns/ttml#styling', [
3649             b'http://www.w3.org/ns/ttml#style',
3650         ]),
3651     )
3652
3653     SUPPORTED_STYLING = [
3654         'color',
3655         'fontFamily',
3656         'fontSize',
3657         'fontStyle',
3658         'fontWeight',
3659         'textDecoration'
3660     ]
3661
3662     _x = functools.partial(xpath_with_ns, ns_map={
3663         'xml': 'http://www.w3.org/XML/1998/namespace',
3664         'ttml': 'http://www.w3.org/ns/ttml',
3665         'tts': 'http://www.w3.org/ns/ttml#styling',
3666     })
3667
3668     styles = {}
3669     default_style = {}
3670
3671     class TTMLPElementParser(object):
3672         _out = ''
3673         _unclosed_elements = []
3674         _applied_styles = []
3675
3676         def start(self, tag, attrib):
3677             if tag in (_x('ttml:br'), 'br'):
3678                 self._out += '\n'
3679             else:
3680                 unclosed_elements = []
3681                 style = {}
3682                 element_style_id = attrib.get('style')
3683                 if default_style:
3684                     style.update(default_style)
3685                 if element_style_id:
3686                     style.update(styles.get(element_style_id, {}))
3687                 for prop in SUPPORTED_STYLING:
3688                     prop_val = attrib.get(_x('tts:' + prop))
3689                     if prop_val:
3690                         style[prop] = prop_val
3691                 if style:
3692                     font = ''
3693                     for k, v in sorted(style.items()):
3694                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3695                             continue
3696                         if k == 'color':
3697                             font += ' color="%s"' % v
3698                         elif k == 'fontSize':
3699                             font += ' size="%s"' % v
3700                         elif k == 'fontFamily':
3701                             font += ' face="%s"' % v
3702                         elif k == 'fontWeight' and v == 'bold':
3703                             self._out += '<b>'
3704                             unclosed_elements.append('b')
3705                         elif k == 'fontStyle' and v == 'italic':
3706                             self._out += '<i>'
3707                             unclosed_elements.append('i')
3708                         elif k == 'textDecoration' and v == 'underline':
3709                             self._out += '<u>'
3710                             unclosed_elements.append('u')
3711                     if font:
3712                         self._out += '<font' + font + '>'
3713                         unclosed_elements.append('font')
3714                     applied_style = {}
3715                     if self._applied_styles:
3716                         applied_style.update(self._applied_styles[-1])
3717                     applied_style.update(style)
3718                     self._applied_styles.append(applied_style)
3719                 self._unclosed_elements.append(unclosed_elements)
3720
3721         def end(self, tag):
3722             if tag not in (_x('ttml:br'), 'br'):
3723                 unclosed_elements = self._unclosed_elements.pop()
3724                 for element in reversed(unclosed_elements):
3725                     self._out += '</%s>' % element
3726                 if unclosed_elements and self._applied_styles:
3727                     self._applied_styles.pop()
3728
3729         def data(self, data):
3730             self._out += data
3731
3732         def close(self):
3733             return self._out.strip()
3734
3735     def parse_node(node):
3736         target = TTMLPElementParser()
3737         parser = xml.etree.ElementTree.XMLParser(target=target)
3738         parser.feed(xml.etree.ElementTree.tostring(node))
3739         return parser.close()
3740
3741     for k, v in LEGACY_NAMESPACES:
3742         for ns in v:
3743             dfxp_data = dfxp_data.replace(ns, k)
3744
3745     dfxp = compat_etree_fromstring(dfxp_data)
3746     out = []
3747     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3748
3749     if not paras:
3750         raise ValueError('Invalid dfxp/TTML subtitle')
3751
3752     repeat = False
3753     while True:
3754         for style in dfxp.findall(_x('.//ttml:style')):
3755             style_id = style.get('id') or style.get(_x('xml:id'))
3756             if not style_id:
3757                 continue
3758             parent_style_id = style.get('style')
3759             if parent_style_id:
3760                 if parent_style_id not in styles:
3761                     repeat = True
3762                     continue
3763                 styles[style_id] = styles[parent_style_id].copy()
3764             for prop in SUPPORTED_STYLING:
3765                 prop_val = style.get(_x('tts:' + prop))
3766                 if prop_val:
3767                     styles.setdefault(style_id, {})[prop] = prop_val
3768         if repeat:
3769             repeat = False
3770         else:
3771             break
3772
3773     for p in ('body', 'div'):
3774         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3775         if ele is None:
3776             continue
3777         style = styles.get(ele.get('style'))
3778         if not style:
3779             continue
3780         default_style.update(style)
3781
3782     for para, index in zip(paras, itertools.count(1)):
3783         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3784         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3785         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3786         if begin_time is None:
3787             continue
3788         if not end_time:
3789             if not dur:
3790                 continue
3791             end_time = begin_time + dur
3792         out.append('%d\n%s --> %s\n%s\n\n' % (
3793             index,
3794             srt_subtitles_timecode(begin_time),
3795             srt_subtitles_timecode(end_time),
3796             parse_node(para)))
3797
3798     return ''.join(out)
3799
3800
3801 def cli_option(params, command_option, param):
3802     param = params.get(param)
3803     if param:
3804         param = compat_str(param)
3805     return [command_option, param] if param is not None else []
3806
3807
3808 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3809     param = params.get(param)
3810     if param is None:
3811         return []
3812     assert isinstance(param, bool)
3813     if separator:
3814         return [command_option + separator + (true_value if param else false_value)]
3815     return [command_option, true_value if param else false_value]
3816
3817
3818 def cli_valueless_option(params, command_option, param, expected_value=True):
3819     param = params.get(param)
3820     return [command_option] if param == expected_value else []
3821
3822
3823 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3824     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3825         if use_compat:
3826             return argdict
3827         else:
3828             argdict = None
3829     if argdict is None:
3830         return default
3831     assert isinstance(argdict, dict)
3832
3833     assert isinstance(keys, (list, tuple))
3834     for key_list in keys:
3835         arg_list = list(filter(
3836             lambda x: x is not None,
3837             [argdict.get(key.lower()) for key in variadic(key_list)]))
3838         if arg_list:
3839             return [arg for args in arg_list for arg in args]
3840     return default
3841
3842
3843 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3844     main_key, exe = main_key.lower(), exe.lower()
3845     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3846     keys = [f'{root_key}{k}' for k in (keys or [''])]
3847     if root_key in keys:
3848         if main_key != exe:
3849             keys.append((main_key, exe))
3850         keys.append('default')
3851     else:
3852         use_compat = False
3853     return cli_configuration_args(argdict, keys, default, use_compat)
3854
3855
3856 class ISO639Utils(object):
3857     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3858     _lang_map = {
3859         'aa': 'aar',
3860         'ab': 'abk',
3861         'ae': 'ave',
3862         'af': 'afr',
3863         'ak': 'aka',
3864         'am': 'amh',
3865         'an': 'arg',
3866         'ar': 'ara',
3867         'as': 'asm',
3868         'av': 'ava',
3869         'ay': 'aym',
3870         'az': 'aze',
3871         'ba': 'bak',
3872         'be': 'bel',
3873         'bg': 'bul',
3874         'bh': 'bih',
3875         'bi': 'bis',
3876         'bm': 'bam',
3877         'bn': 'ben',
3878         'bo': 'bod',
3879         'br': 'bre',
3880         'bs': 'bos',
3881         'ca': 'cat',
3882         'ce': 'che',
3883         'ch': 'cha',
3884         'co': 'cos',
3885         'cr': 'cre',
3886         'cs': 'ces',
3887         'cu': 'chu',
3888         'cv': 'chv',
3889         'cy': 'cym',
3890         'da': 'dan',
3891         'de': 'deu',
3892         'dv': 'div',
3893         'dz': 'dzo',
3894         'ee': 'ewe',
3895         'el': 'ell',
3896         'en': 'eng',
3897         'eo': 'epo',
3898         'es': 'spa',
3899         'et': 'est',
3900         'eu': 'eus',
3901         'fa': 'fas',
3902         'ff': 'ful',
3903         'fi': 'fin',
3904         'fj': 'fij',
3905         'fo': 'fao',
3906         'fr': 'fra',
3907         'fy': 'fry',
3908         'ga': 'gle',
3909         'gd': 'gla',
3910         'gl': 'glg',
3911         'gn': 'grn',
3912         'gu': 'guj',
3913         'gv': 'glv',
3914         'ha': 'hau',
3915         'he': 'heb',
3916         'iw': 'heb',  # Replaced by he in 1989 revision
3917         'hi': 'hin',
3918         'ho': 'hmo',
3919         'hr': 'hrv',
3920         'ht': 'hat',
3921         'hu': 'hun',
3922         'hy': 'hye',
3923         'hz': 'her',
3924         'ia': 'ina',
3925         'id': 'ind',
3926         'in': 'ind',  # Replaced by id in 1989 revision
3927         'ie': 'ile',
3928         'ig': 'ibo',
3929         'ii': 'iii',
3930         'ik': 'ipk',
3931         'io': 'ido',
3932         'is': 'isl',
3933         'it': 'ita',
3934         'iu': 'iku',
3935         'ja': 'jpn',
3936         'jv': 'jav',
3937         'ka': 'kat',
3938         'kg': 'kon',
3939         'ki': 'kik',
3940         'kj': 'kua',
3941         'kk': 'kaz',
3942         'kl': 'kal',
3943         'km': 'khm',
3944         'kn': 'kan',
3945         'ko': 'kor',
3946         'kr': 'kau',
3947         'ks': 'kas',
3948         'ku': 'kur',
3949         'kv': 'kom',
3950         'kw': 'cor',
3951         'ky': 'kir',
3952         'la': 'lat',
3953         'lb': 'ltz',
3954         'lg': 'lug',
3955         'li': 'lim',
3956         'ln': 'lin',
3957         'lo': 'lao',
3958         'lt': 'lit',
3959         'lu': 'lub',
3960         'lv': 'lav',
3961         'mg': 'mlg',
3962         'mh': 'mah',
3963         'mi': 'mri',
3964         'mk': 'mkd',
3965         'ml': 'mal',
3966         'mn': 'mon',
3967         'mr': 'mar',
3968         'ms': 'msa',
3969         'mt': 'mlt',
3970         'my': 'mya',
3971         'na': 'nau',
3972         'nb': 'nob',
3973         'nd': 'nde',
3974         'ne': 'nep',
3975         'ng': 'ndo',
3976         'nl': 'nld',
3977         'nn': 'nno',
3978         'no': 'nor',
3979         'nr': 'nbl',
3980         'nv': 'nav',
3981         'ny': 'nya',
3982         'oc': 'oci',
3983         'oj': 'oji',
3984         'om': 'orm',
3985         'or': 'ori',
3986         'os': 'oss',
3987         'pa': 'pan',
3988         'pi': 'pli',
3989         'pl': 'pol',
3990         'ps': 'pus',
3991         'pt': 'por',
3992         'qu': 'que',
3993         'rm': 'roh',
3994         'rn': 'run',
3995         'ro': 'ron',
3996         'ru': 'rus',
3997         'rw': 'kin',
3998         'sa': 'san',
3999         'sc': 'srd',
4000         'sd': 'snd',
4001         'se': 'sme',
4002         'sg': 'sag',
4003         'si': 'sin',
4004         'sk': 'slk',
4005         'sl': 'slv',
4006         'sm': 'smo',
4007         'sn': 'sna',
4008         'so': 'som',
4009         'sq': 'sqi',
4010         'sr': 'srp',
4011         'ss': 'ssw',
4012         'st': 'sot',
4013         'su': 'sun',
4014         'sv': 'swe',
4015         'sw': 'swa',
4016         'ta': 'tam',
4017         'te': 'tel',
4018         'tg': 'tgk',
4019         'th': 'tha',
4020         'ti': 'tir',
4021         'tk': 'tuk',
4022         'tl': 'tgl',
4023         'tn': 'tsn',
4024         'to': 'ton',
4025         'tr': 'tur',
4026         'ts': 'tso',
4027         'tt': 'tat',
4028         'tw': 'twi',
4029         'ty': 'tah',
4030         'ug': 'uig',
4031         'uk': 'ukr',
4032         'ur': 'urd',
4033         'uz': 'uzb',
4034         've': 'ven',
4035         'vi': 'vie',
4036         'vo': 'vol',
4037         'wa': 'wln',
4038         'wo': 'wol',
4039         'xh': 'xho',
4040         'yi': 'yid',
4041         'ji': 'yid',  # Replaced by yi in 1989 revision
4042         'yo': 'yor',
4043         'za': 'zha',
4044         'zh': 'zho',
4045         'zu': 'zul',
4046     }
4047
4048     @classmethod
4049     def short2long(cls, code):
4050         """Convert language code from ISO 639-1 to ISO 639-2/T"""
4051         return cls._lang_map.get(code[:2])
4052
4053     @classmethod
4054     def long2short(cls, code):
4055         """Convert language code from ISO 639-2/T to ISO 639-1"""
4056         for short_name, long_name in cls._lang_map.items():
4057             if long_name == code:
4058                 return short_name
4059
4060
4061 class ISO3166Utils(object):
4062     # From http://data.okfn.org/data/core/country-list
4063     _country_map = {
4064         'AF': 'Afghanistan',
4065         'AX': 'Åland Islands',
4066         'AL': 'Albania',
4067         'DZ': 'Algeria',
4068         'AS': 'American Samoa',
4069         'AD': 'Andorra',
4070         'AO': 'Angola',
4071         'AI': 'Anguilla',
4072         'AQ': 'Antarctica',
4073         'AG': 'Antigua and Barbuda',
4074         'AR': 'Argentina',
4075         'AM': 'Armenia',
4076         'AW': 'Aruba',
4077         'AU': 'Australia',
4078         'AT': 'Austria',
4079         'AZ': 'Azerbaijan',
4080         'BS': 'Bahamas',
4081         'BH': 'Bahrain',
4082         'BD': 'Bangladesh',
4083         'BB': 'Barbados',
4084         'BY': 'Belarus',
4085         'BE': 'Belgium',
4086         'BZ': 'Belize',
4087         'BJ': 'Benin',
4088         'BM': 'Bermuda',
4089         'BT': 'Bhutan',
4090         'BO': 'Bolivia, Plurinational State of',
4091         'BQ': 'Bonaire, Sint Eustatius and Saba',
4092         'BA': 'Bosnia and Herzegovina',
4093         'BW': 'Botswana',
4094         'BV': 'Bouvet Island',
4095         'BR': 'Brazil',
4096         'IO': 'British Indian Ocean Territory',
4097         'BN': 'Brunei Darussalam',
4098         'BG': 'Bulgaria',
4099         'BF': 'Burkina Faso',
4100         'BI': 'Burundi',
4101         'KH': 'Cambodia',
4102         'CM': 'Cameroon',
4103         'CA': 'Canada',
4104         'CV': 'Cape Verde',
4105         'KY': 'Cayman Islands',
4106         'CF': 'Central African Republic',
4107         'TD': 'Chad',
4108         'CL': 'Chile',
4109         'CN': 'China',
4110         'CX': 'Christmas Island',
4111         'CC': 'Cocos (Keeling) Islands',
4112         'CO': 'Colombia',
4113         'KM': 'Comoros',
4114         'CG': 'Congo',
4115         'CD': 'Congo, the Democratic Republic of the',
4116         'CK': 'Cook Islands',
4117         'CR': 'Costa Rica',
4118         'CI': 'Côte d\'Ivoire',
4119         'HR': 'Croatia',
4120         'CU': 'Cuba',
4121         'CW': 'Curaçao',
4122         'CY': 'Cyprus',
4123         'CZ': 'Czech Republic',
4124         'DK': 'Denmark',
4125         'DJ': 'Djibouti',
4126         'DM': 'Dominica',
4127         'DO': 'Dominican Republic',
4128         'EC': 'Ecuador',
4129         'EG': 'Egypt',
4130         'SV': 'El Salvador',
4131         'GQ': 'Equatorial Guinea',
4132         'ER': 'Eritrea',
4133         'EE': 'Estonia',
4134         'ET': 'Ethiopia',
4135         'FK': 'Falkland Islands (Malvinas)',
4136         'FO': 'Faroe Islands',
4137         'FJ': 'Fiji',
4138         'FI': 'Finland',
4139         'FR': 'France',
4140         'GF': 'French Guiana',
4141         'PF': 'French Polynesia',
4142         'TF': 'French Southern Territories',
4143         'GA': 'Gabon',
4144         'GM': 'Gambia',
4145         'GE': 'Georgia',
4146         'DE': 'Germany',
4147         'GH': 'Ghana',
4148         'GI': 'Gibraltar',
4149         'GR': 'Greece',
4150         'GL': 'Greenland',
4151         'GD': 'Grenada',
4152         'GP': 'Guadeloupe',
4153         'GU': 'Guam',
4154         'GT': 'Guatemala',
4155         'GG': 'Guernsey',
4156         'GN': 'Guinea',
4157         'GW': 'Guinea-Bissau',
4158         'GY': 'Guyana',
4159         'HT': 'Haiti',
4160         'HM': 'Heard Island and McDonald Islands',
4161         'VA': 'Holy See (Vatican City State)',
4162         'HN': 'Honduras',
4163         'HK': 'Hong Kong',
4164         'HU': 'Hungary',
4165         'IS': 'Iceland',
4166         'IN': 'India',
4167         'ID': 'Indonesia',
4168         'IR': 'Iran, Islamic Republic of',
4169         'IQ': 'Iraq',
4170         'IE': 'Ireland',
4171         'IM': 'Isle of Man',
4172         'IL': 'Israel',
4173         'IT': 'Italy',
4174         'JM': 'Jamaica',
4175         'JP': 'Japan',
4176         'JE': 'Jersey',
4177         'JO': 'Jordan',
4178         'KZ': 'Kazakhstan',
4179         'KE': 'Kenya',
4180         'KI': 'Kiribati',
4181         'KP': 'Korea, Democratic People\'s Republic of',
4182         'KR': 'Korea, Republic of',
4183         'KW': 'Kuwait',
4184         'KG': 'Kyrgyzstan',
4185         'LA': 'Lao People\'s Democratic Republic',
4186         'LV': 'Latvia',
4187         'LB': 'Lebanon',
4188         'LS': 'Lesotho',
4189         'LR': 'Liberia',
4190         'LY': 'Libya',
4191         'LI': 'Liechtenstein',
4192         'LT': 'Lithuania',
4193         'LU': 'Luxembourg',
4194         'MO': 'Macao',
4195         'MK': 'Macedonia, the Former Yugoslav Republic of',
4196         'MG': 'Madagascar',
4197         'MW': 'Malawi',
4198         'MY': 'Malaysia',
4199         'MV': 'Maldives',
4200         'ML': 'Mali',
4201         'MT': 'Malta',
4202         'MH': 'Marshall Islands',
4203         'MQ': 'Martinique',
4204         'MR': 'Mauritania',
4205         'MU': 'Mauritius',
4206         'YT': 'Mayotte',
4207         'MX': 'Mexico',
4208         'FM': 'Micronesia, Federated States of',
4209         'MD': 'Moldova, Republic of',
4210         'MC': 'Monaco',
4211         'MN': 'Mongolia',
4212         'ME': 'Montenegro',
4213         'MS': 'Montserrat',
4214         'MA': 'Morocco',
4215         'MZ': 'Mozambique',
4216         'MM': 'Myanmar',
4217         'NA': 'Namibia',
4218         'NR': 'Nauru',
4219         'NP': 'Nepal',
4220         'NL': 'Netherlands',
4221         'NC': 'New Caledonia',
4222         'NZ': 'New Zealand',
4223         'NI': 'Nicaragua',
4224         'NE': 'Niger',
4225         'NG': 'Nigeria',
4226         'NU': 'Niue',
4227         'NF': 'Norfolk Island',
4228         'MP': 'Northern Mariana Islands',
4229         'NO': 'Norway',
4230         'OM': 'Oman',
4231         'PK': 'Pakistan',
4232         'PW': 'Palau',
4233         'PS': 'Palestine, State of',
4234         'PA': 'Panama',
4235         'PG': 'Papua New Guinea',
4236         'PY': 'Paraguay',
4237         'PE': 'Peru',
4238         'PH': 'Philippines',
4239         'PN': 'Pitcairn',
4240         'PL': 'Poland',
4241         'PT': 'Portugal',
4242         'PR': 'Puerto Rico',
4243         'QA': 'Qatar',
4244         'RE': 'Réunion',
4245         'RO': 'Romania',
4246         'RU': 'Russian Federation',
4247         'RW': 'Rwanda',
4248         'BL': 'Saint Barthélemy',
4249         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4250         'KN': 'Saint Kitts and Nevis',
4251         'LC': 'Saint Lucia',
4252         'MF': 'Saint Martin (French part)',
4253         'PM': 'Saint Pierre and Miquelon',
4254         'VC': 'Saint Vincent and the Grenadines',
4255         'WS': 'Samoa',
4256         'SM': 'San Marino',
4257         'ST': 'Sao Tome and Principe',
4258         'SA': 'Saudi Arabia',
4259         'SN': 'Senegal',
4260         'RS': 'Serbia',
4261         'SC': 'Seychelles',
4262         'SL': 'Sierra Leone',
4263         'SG': 'Singapore',
4264         'SX': 'Sint Maarten (Dutch part)',
4265         'SK': 'Slovakia',
4266         'SI': 'Slovenia',
4267         'SB': 'Solomon Islands',
4268         'SO': 'Somalia',
4269         'ZA': 'South Africa',
4270         'GS': 'South Georgia and the South Sandwich Islands',
4271         'SS': 'South Sudan',
4272         'ES': 'Spain',
4273         'LK': 'Sri Lanka',
4274         'SD': 'Sudan',
4275         'SR': 'Suriname',
4276         'SJ': 'Svalbard and Jan Mayen',
4277         'SZ': 'Swaziland',
4278         'SE': 'Sweden',
4279         'CH': 'Switzerland',
4280         'SY': 'Syrian Arab Republic',
4281         'TW': 'Taiwan, Province of China',
4282         'TJ': 'Tajikistan',
4283         'TZ': 'Tanzania, United Republic of',
4284         'TH': 'Thailand',
4285         'TL': 'Timor-Leste',
4286         'TG': 'Togo',
4287         'TK': 'Tokelau',
4288         'TO': 'Tonga',
4289         'TT': 'Trinidad and Tobago',
4290         'TN': 'Tunisia',
4291         'TR': 'Turkey',
4292         'TM': 'Turkmenistan',
4293         'TC': 'Turks and Caicos Islands',
4294         'TV': 'Tuvalu',
4295         'UG': 'Uganda',
4296         'UA': 'Ukraine',
4297         'AE': 'United Arab Emirates',
4298         'GB': 'United Kingdom',
4299         'US': 'United States',
4300         'UM': 'United States Minor Outlying Islands',
4301         'UY': 'Uruguay',
4302         'UZ': 'Uzbekistan',
4303         'VU': 'Vanuatu',
4304         'VE': 'Venezuela, Bolivarian Republic of',
4305         'VN': 'Viet Nam',
4306         'VG': 'Virgin Islands, British',
4307         'VI': 'Virgin Islands, U.S.',
4308         'WF': 'Wallis and Futuna',
4309         'EH': 'Western Sahara',
4310         'YE': 'Yemen',
4311         'ZM': 'Zambia',
4312         'ZW': 'Zimbabwe',
4313     }
4314
4315     @classmethod
4316     def short2full(cls, code):
4317         """Convert an ISO 3166-2 country code to the corresponding full name"""
4318         return cls._country_map.get(code.upper())
4319
4320
4321 class GeoUtils(object):
4322     # Major IPv4 address blocks per country
4323     _country_ip_map = {
4324         'AD': '46.172.224.0/19',
4325         'AE': '94.200.0.0/13',
4326         'AF': '149.54.0.0/17',
4327         'AG': '209.59.64.0/18',
4328         'AI': '204.14.248.0/21',
4329         'AL': '46.99.0.0/16',
4330         'AM': '46.70.0.0/15',
4331         'AO': '105.168.0.0/13',
4332         'AP': '182.50.184.0/21',
4333         'AQ': '23.154.160.0/24',
4334         'AR': '181.0.0.0/12',
4335         'AS': '202.70.112.0/20',
4336         'AT': '77.116.0.0/14',
4337         'AU': '1.128.0.0/11',
4338         'AW': '181.41.0.0/18',
4339         'AX': '185.217.4.0/22',
4340         'AZ': '5.197.0.0/16',
4341         'BA': '31.176.128.0/17',
4342         'BB': '65.48.128.0/17',
4343         'BD': '114.130.0.0/16',
4344         'BE': '57.0.0.0/8',
4345         'BF': '102.178.0.0/15',
4346         'BG': '95.42.0.0/15',
4347         'BH': '37.131.0.0/17',
4348         'BI': '154.117.192.0/18',
4349         'BJ': '137.255.0.0/16',
4350         'BL': '185.212.72.0/23',
4351         'BM': '196.12.64.0/18',
4352         'BN': '156.31.0.0/16',
4353         'BO': '161.56.0.0/16',
4354         'BQ': '161.0.80.0/20',
4355         'BR': '191.128.0.0/12',
4356         'BS': '24.51.64.0/18',
4357         'BT': '119.2.96.0/19',
4358         'BW': '168.167.0.0/16',
4359         'BY': '178.120.0.0/13',
4360         'BZ': '179.42.192.0/18',
4361         'CA': '99.224.0.0/11',
4362         'CD': '41.243.0.0/16',
4363         'CF': '197.242.176.0/21',
4364         'CG': '160.113.0.0/16',
4365         'CH': '85.0.0.0/13',
4366         'CI': '102.136.0.0/14',
4367         'CK': '202.65.32.0/19',
4368         'CL': '152.172.0.0/14',
4369         'CM': '102.244.0.0/14',
4370         'CN': '36.128.0.0/10',
4371         'CO': '181.240.0.0/12',
4372         'CR': '201.192.0.0/12',
4373         'CU': '152.206.0.0/15',
4374         'CV': '165.90.96.0/19',
4375         'CW': '190.88.128.0/17',
4376         'CY': '31.153.0.0/16',
4377         'CZ': '88.100.0.0/14',
4378         'DE': '53.0.0.0/8',
4379         'DJ': '197.241.0.0/17',
4380         'DK': '87.48.0.0/12',
4381         'DM': '192.243.48.0/20',
4382         'DO': '152.166.0.0/15',
4383         'DZ': '41.96.0.0/12',
4384         'EC': '186.68.0.0/15',
4385         'EE': '90.190.0.0/15',
4386         'EG': '156.160.0.0/11',
4387         'ER': '196.200.96.0/20',
4388         'ES': '88.0.0.0/11',
4389         'ET': '196.188.0.0/14',
4390         'EU': '2.16.0.0/13',
4391         'FI': '91.152.0.0/13',
4392         'FJ': '144.120.0.0/16',
4393         'FK': '80.73.208.0/21',
4394         'FM': '119.252.112.0/20',
4395         'FO': '88.85.32.0/19',
4396         'FR': '90.0.0.0/9',
4397         'GA': '41.158.0.0/15',
4398         'GB': '25.0.0.0/8',
4399         'GD': '74.122.88.0/21',
4400         'GE': '31.146.0.0/16',
4401         'GF': '161.22.64.0/18',
4402         'GG': '62.68.160.0/19',
4403         'GH': '154.160.0.0/12',
4404         'GI': '95.164.0.0/16',
4405         'GL': '88.83.0.0/19',
4406         'GM': '160.182.0.0/15',
4407         'GN': '197.149.192.0/18',
4408         'GP': '104.250.0.0/19',
4409         'GQ': '105.235.224.0/20',
4410         'GR': '94.64.0.0/13',
4411         'GT': '168.234.0.0/16',
4412         'GU': '168.123.0.0/16',
4413         'GW': '197.214.80.0/20',
4414         'GY': '181.41.64.0/18',
4415         'HK': '113.252.0.0/14',
4416         'HN': '181.210.0.0/16',
4417         'HR': '93.136.0.0/13',
4418         'HT': '148.102.128.0/17',
4419         'HU': '84.0.0.0/14',
4420         'ID': '39.192.0.0/10',
4421         'IE': '87.32.0.0/12',
4422         'IL': '79.176.0.0/13',
4423         'IM': '5.62.80.0/20',
4424         'IN': '117.192.0.0/10',
4425         'IO': '203.83.48.0/21',
4426         'IQ': '37.236.0.0/14',
4427         'IR': '2.176.0.0/12',
4428         'IS': '82.221.0.0/16',
4429         'IT': '79.0.0.0/10',
4430         'JE': '87.244.64.0/18',
4431         'JM': '72.27.0.0/17',
4432         'JO': '176.29.0.0/16',
4433         'JP': '133.0.0.0/8',
4434         'KE': '105.48.0.0/12',
4435         'KG': '158.181.128.0/17',
4436         'KH': '36.37.128.0/17',
4437         'KI': '103.25.140.0/22',
4438         'KM': '197.255.224.0/20',
4439         'KN': '198.167.192.0/19',
4440         'KP': '175.45.176.0/22',
4441         'KR': '175.192.0.0/10',
4442         'KW': '37.36.0.0/14',
4443         'KY': '64.96.0.0/15',
4444         'KZ': '2.72.0.0/13',
4445         'LA': '115.84.64.0/18',
4446         'LB': '178.135.0.0/16',
4447         'LC': '24.92.144.0/20',
4448         'LI': '82.117.0.0/19',
4449         'LK': '112.134.0.0/15',
4450         'LR': '102.183.0.0/16',
4451         'LS': '129.232.0.0/17',
4452         'LT': '78.56.0.0/13',
4453         'LU': '188.42.0.0/16',
4454         'LV': '46.109.0.0/16',
4455         'LY': '41.252.0.0/14',
4456         'MA': '105.128.0.0/11',
4457         'MC': '88.209.64.0/18',
4458         'MD': '37.246.0.0/16',
4459         'ME': '178.175.0.0/17',
4460         'MF': '74.112.232.0/21',
4461         'MG': '154.126.0.0/17',
4462         'MH': '117.103.88.0/21',
4463         'MK': '77.28.0.0/15',
4464         'ML': '154.118.128.0/18',
4465         'MM': '37.111.0.0/17',
4466         'MN': '49.0.128.0/17',
4467         'MO': '60.246.0.0/16',
4468         'MP': '202.88.64.0/20',
4469         'MQ': '109.203.224.0/19',
4470         'MR': '41.188.64.0/18',
4471         'MS': '208.90.112.0/22',
4472         'MT': '46.11.0.0/16',
4473         'MU': '105.16.0.0/12',
4474         'MV': '27.114.128.0/18',
4475         'MW': '102.70.0.0/15',
4476         'MX': '187.192.0.0/11',
4477         'MY': '175.136.0.0/13',
4478         'MZ': '197.218.0.0/15',
4479         'NA': '41.182.0.0/16',
4480         'NC': '101.101.0.0/18',
4481         'NE': '197.214.0.0/18',
4482         'NF': '203.17.240.0/22',
4483         'NG': '105.112.0.0/12',
4484         'NI': '186.76.0.0/15',
4485         'NL': '145.96.0.0/11',
4486         'NO': '84.208.0.0/13',
4487         'NP': '36.252.0.0/15',
4488         'NR': '203.98.224.0/19',
4489         'NU': '49.156.48.0/22',
4490         'NZ': '49.224.0.0/14',
4491         'OM': '5.36.0.0/15',
4492         'PA': '186.72.0.0/15',
4493         'PE': '186.160.0.0/14',
4494         'PF': '123.50.64.0/18',
4495         'PG': '124.240.192.0/19',
4496         'PH': '49.144.0.0/13',
4497         'PK': '39.32.0.0/11',
4498         'PL': '83.0.0.0/11',
4499         'PM': '70.36.0.0/20',
4500         'PR': '66.50.0.0/16',
4501         'PS': '188.161.0.0/16',
4502         'PT': '85.240.0.0/13',
4503         'PW': '202.124.224.0/20',
4504         'PY': '181.120.0.0/14',
4505         'QA': '37.210.0.0/15',
4506         'RE': '102.35.0.0/16',
4507         'RO': '79.112.0.0/13',
4508         'RS': '93.86.0.0/15',
4509         'RU': '5.136.0.0/13',
4510         'RW': '41.186.0.0/16',
4511         'SA': '188.48.0.0/13',
4512         'SB': '202.1.160.0/19',
4513         'SC': '154.192.0.0/11',
4514         'SD': '102.120.0.0/13',
4515         'SE': '78.64.0.0/12',
4516         'SG': '8.128.0.0/10',
4517         'SI': '188.196.0.0/14',
4518         'SK': '78.98.0.0/15',
4519         'SL': '102.143.0.0/17',
4520         'SM': '89.186.32.0/19',
4521         'SN': '41.82.0.0/15',
4522         'SO': '154.115.192.0/18',
4523         'SR': '186.179.128.0/17',
4524         'SS': '105.235.208.0/21',
4525         'ST': '197.159.160.0/19',
4526         'SV': '168.243.0.0/16',
4527         'SX': '190.102.0.0/20',
4528         'SY': '5.0.0.0/16',
4529         'SZ': '41.84.224.0/19',
4530         'TC': '65.255.48.0/20',
4531         'TD': '154.68.128.0/19',
4532         'TG': '196.168.0.0/14',
4533         'TH': '171.96.0.0/13',
4534         'TJ': '85.9.128.0/18',
4535         'TK': '27.96.24.0/21',
4536         'TL': '180.189.160.0/20',
4537         'TM': '95.85.96.0/19',
4538         'TN': '197.0.0.0/11',
4539         'TO': '175.176.144.0/21',
4540         'TR': '78.160.0.0/11',
4541         'TT': '186.44.0.0/15',
4542         'TV': '202.2.96.0/19',
4543         'TW': '120.96.0.0/11',
4544         'TZ': '156.156.0.0/14',
4545         'UA': '37.52.0.0/14',
4546         'UG': '102.80.0.0/13',
4547         'US': '6.0.0.0/8',
4548         'UY': '167.56.0.0/13',
4549         'UZ': '84.54.64.0/18',
4550         'VA': '212.77.0.0/19',
4551         'VC': '207.191.240.0/21',
4552         'VE': '186.88.0.0/13',
4553         'VG': '66.81.192.0/20',
4554         'VI': '146.226.0.0/16',
4555         'VN': '14.160.0.0/11',
4556         'VU': '202.80.32.0/20',
4557         'WF': '117.20.32.0/21',
4558         'WS': '202.4.32.0/19',
4559         'YE': '134.35.0.0/16',
4560         'YT': '41.242.116.0/22',
4561         'ZA': '41.0.0.0/11',
4562         'ZM': '102.144.0.0/13',
4563         'ZW': '102.177.192.0/18',
4564     }
4565
4566     @classmethod
4567     def random_ipv4(cls, code_or_block):
4568         if len(code_or_block) == 2:
4569             block = cls._country_ip_map.get(code_or_block.upper())
4570             if not block:
4571                 return None
4572         else:
4573             block = code_or_block
4574         addr, preflen = block.split('/')
4575         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4576         addr_max = addr_min | (0xffffffff >> int(preflen))
4577         return compat_str(socket.inet_ntoa(
4578             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4579
4580
4581 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4582     def __init__(self, proxies=None):
4583         # Set default handlers
4584         for type in ('http', 'https'):
4585             setattr(self, '%s_open' % type,
4586                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4587                         meth(r, proxy, type))
4588         compat_urllib_request.ProxyHandler.__init__(self, proxies)
4589
4590     def proxy_open(self, req, proxy, type):
4591         req_proxy = req.headers.get('Ytdl-request-proxy')
4592         if req_proxy is not None:
4593             proxy = req_proxy
4594             del req.headers['Ytdl-request-proxy']
4595
4596         if proxy == '__noproxy__':
4597             return None  # No Proxy
4598         if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4599             req.add_header('Ytdl-socks-proxy', proxy)
4600             # yt-dlp's http/https handlers do wrapping the socket with socks
4601             return None
4602         return compat_urllib_request.ProxyHandler.proxy_open(
4603             self, req, proxy, type)
4604
4605
4606 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4607 # released into Public Domain
4608 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4609
4610 def long_to_bytes(n, blocksize=0):
4611     """long_to_bytes(n:long, blocksize:int) : string
4612     Convert a long integer to a byte string.
4613
4614     If optional blocksize is given and greater than zero, pad the front of the
4615     byte string with binary zeros so that the length is a multiple of
4616     blocksize.
4617     """
4618     # after much testing, this algorithm was deemed to be the fastest
4619     s = b''
4620     n = int(n)
4621     while n > 0:
4622         s = compat_struct_pack('>I', n & 0xffffffff) + s
4623         n = n >> 32
4624     # strip off leading zeros
4625     for i in range(len(s)):
4626         if s[i] != b'\000'[0]:
4627             break
4628     else:
4629         # only happens when n == 0
4630         s = b'\000'
4631         i = 0
4632     s = s[i:]
4633     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4634     # de-padding being done above, but sigh...
4635     if blocksize > 0 and len(s) % blocksize:
4636         s = (blocksize - len(s) % blocksize) * b'\000' + s
4637     return s
4638
4639
4640 def bytes_to_long(s):
4641     """bytes_to_long(string) : long
4642     Convert a byte string to a long integer.
4643
4644     This is (essentially) the inverse of long_to_bytes().
4645     """
4646     acc = 0
4647     length = len(s)
4648     if length % 4:
4649         extra = (4 - length % 4)
4650         s = b'\000' * extra + s
4651         length = length + extra
4652     for i in range(0, length, 4):
4653         acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4654     return acc
4655
4656
4657 def ohdave_rsa_encrypt(data, exponent, modulus):
4658     '''
4659     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4660
4661     Input:
4662         data: data to encrypt, bytes-like object
4663         exponent, modulus: parameter e and N of RSA algorithm, both integer
4664     Output: hex string of encrypted data
4665
4666     Limitation: supports one block encryption only
4667     '''
4668
4669     payload = int(binascii.hexlify(data[::-1]), 16)
4670     encrypted = pow(payload, exponent, modulus)
4671     return '%x' % encrypted
4672
4673
4674 def pkcs1pad(data, length):
4675     """
4676     Padding input data with PKCS#1 scheme
4677
4678     @param {int[]} data        input data
4679     @param {int}   length      target length
4680     @returns {int[]}           padded data
4681     """
4682     if len(data) > length - 11:
4683         raise ValueError('Input data too long for PKCS#1 padding')
4684
4685     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4686     return [0, 2] + pseudo_random + [0] + data
4687
4688
4689 def encode_base_n(num, n, table=None):
4690     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4691     if not table:
4692         table = FULL_TABLE[:n]
4693
4694     if n > len(table):
4695         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4696
4697     if num == 0:
4698         return table[0]
4699
4700     ret = ''
4701     while num:
4702         ret = table[num % n] + ret
4703         num = num // n
4704     return ret
4705
4706
4707 def decode_packed_codes(code):
4708     mobj = re.search(PACKED_CODES_RE, code)
4709     obfuscated_code, base, count, symbols = mobj.groups()
4710     base = int(base)
4711     count = int(count)
4712     symbols = symbols.split('|')
4713     symbol_table = {}
4714
4715     while count:
4716         count -= 1
4717         base_n_count = encode_base_n(count, base)
4718         symbol_table[base_n_count] = symbols[count] or base_n_count
4719
4720     return re.sub(
4721         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4722         obfuscated_code)
4723
4724
4725 def caesar(s, alphabet, shift):
4726     if shift == 0:
4727         return s
4728     l = len(alphabet)
4729     return ''.join(
4730         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4731         for c in s)
4732
4733
4734 def rot47(s):
4735     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4736
4737
4738 def parse_m3u8_attributes(attrib):
4739     info = {}
4740     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4741         if val.startswith('"'):
4742             val = val[1:-1]
4743         info[key] = val
4744     return info
4745
4746
4747 def urshift(val, n):
4748     return val >> n if val >= 0 else (val + 0x100000000) >> n
4749
4750
4751 # Based on png2str() written by @gdkchan and improved by @yokrysty
4752 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4753 def decode_png(png_data):
4754     # Reference: https://www.w3.org/TR/PNG/
4755     header = png_data[8:]
4756
4757     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4758         raise IOError('Not a valid PNG file.')
4759
4760     int_map = {1: '>B', 2: '>H', 4: '>I'}
4761     unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4762
4763     chunks = []
4764
4765     while header:
4766         length = unpack_integer(header[:4])
4767         header = header[4:]
4768
4769         chunk_type = header[:4]
4770         header = header[4:]
4771
4772         chunk_data = header[:length]
4773         header = header[length:]
4774
4775         header = header[4:]  # Skip CRC
4776
4777         chunks.append({
4778             'type': chunk_type,
4779             'length': length,
4780             'data': chunk_data
4781         })
4782
4783     ihdr = chunks[0]['data']
4784
4785     width = unpack_integer(ihdr[:4])
4786     height = unpack_integer(ihdr[4:8])
4787
4788     idat = b''
4789
4790     for chunk in chunks:
4791         if chunk['type'] == b'IDAT':
4792             idat += chunk['data']
4793
4794     if not idat:
4795         raise IOError('Unable to read PNG data.')
4796
4797     decompressed_data = bytearray(zlib.decompress(idat))
4798
4799     stride = width * 3
4800     pixels = []
4801
4802     def _get_pixel(idx):
4803         x = idx % stride
4804         y = idx // stride
4805         return pixels[y][x]
4806
4807     for y in range(height):
4808         basePos = y * (1 + stride)
4809         filter_type = decompressed_data[basePos]
4810
4811         current_row = []
4812
4813         pixels.append(current_row)
4814
4815         for x in range(stride):
4816             color = decompressed_data[1 + basePos + x]
4817             basex = y * stride + x
4818             left = 0
4819             up = 0
4820
4821             if x > 2:
4822                 left = _get_pixel(basex - 3)
4823             if y > 0:
4824                 up = _get_pixel(basex - stride)
4825
4826             if filter_type == 1:  # Sub
4827                 color = (color + left) & 0xff
4828             elif filter_type == 2:  # Up
4829                 color = (color + up) & 0xff
4830             elif filter_type == 3:  # Average
4831                 color = (color + ((left + up) >> 1)) & 0xff
4832             elif filter_type == 4:  # Paeth
4833                 a = left
4834                 b = up
4835                 c = 0
4836
4837                 if x > 2 and y > 0:
4838                     c = _get_pixel(basex - stride - 3)
4839
4840                 p = a + b - c
4841
4842                 pa = abs(p - a)
4843                 pb = abs(p - b)
4844                 pc = abs(p - c)
4845
4846                 if pa <= pb and pa <= pc:
4847                     color = (color + a) & 0xff
4848                 elif pb <= pc:
4849                     color = (color + b) & 0xff
4850                 else:
4851                     color = (color + c) & 0xff
4852
4853             current_row.append(color)
4854
4855     return width, height, pixels
4856
4857
4858 def write_xattr(path, key, value):
4859     # This mess below finds the best xattr tool for the job
4860     try:
4861         # try the pyxattr module...
4862         import xattr
4863
4864         if hasattr(xattr, 'set'):  # pyxattr
4865             # Unicode arguments are not supported in python-pyxattr until
4866             # version 0.5.0
4867             # See https://github.com/ytdl-org/youtube-dl/issues/5498
4868             pyxattr_required_version = '0.5.0'
4869             if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4870                 # TODO: fallback to CLI tools
4871                 raise XAttrUnavailableError(
4872                     'python-pyxattr is detected but is too old. '
4873                     'yt-dlp requires %s or above while your version is %s. '
4874                     'Falling back to other xattr implementations' % (
4875                         pyxattr_required_version, xattr.__version__))
4876
4877             setxattr = xattr.set
4878         else:  # xattr
4879             setxattr = xattr.setxattr
4880
4881         try:
4882             setxattr(path, key, value)
4883         except EnvironmentError as e:
4884             raise XAttrMetadataError(e.errno, e.strerror)
4885
4886     except ImportError:
4887         if compat_os_name == 'nt':
4888             # Write xattrs to NTFS Alternate Data Streams:
4889             # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4890             assert ':' not in key
4891             assert os.path.exists(path)
4892
4893             ads_fn = path + ':' + key
4894             try:
4895                 with open(ads_fn, 'wb') as f:
4896                     f.write(value)
4897             except EnvironmentError as e:
4898                 raise XAttrMetadataError(e.errno, e.strerror)
4899         else:
4900             user_has_setfattr = check_executable('setfattr', ['--version'])
4901             user_has_xattr = check_executable('xattr', ['-h'])
4902
4903             if user_has_setfattr or user_has_xattr:
4904
4905                 value = value.decode('utf-8')
4906                 if user_has_setfattr:
4907                     executable = 'setfattr'
4908                     opts = ['-n', key, '-v', value]
4909                 elif user_has_xattr:
4910                     executable = 'xattr'
4911                     opts = ['-w', key, value]
4912
4913                 cmd = ([encodeFilename(executable, True)]
4914                        + [encodeArgument(o) for o in opts]
4915                        + [encodeFilename(path, True)])
4916
4917                 try:
4918                     p = Popen(
4919                         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4920                 except EnvironmentError as e:
4921                     raise XAttrMetadataError(e.errno, e.strerror)
4922                 stdout, stderr = p.communicate_or_kill()
4923                 stderr = stderr.decode('utf-8', 'replace')
4924                 if p.returncode != 0:
4925                     raise XAttrMetadataError(p.returncode, stderr)
4926
4927             else:
4928                 # On Unix, and can't find pyxattr, setfattr, or xattr.
4929                 if sys.platform.startswith('linux'):
4930                     raise XAttrUnavailableError(
4931                         "Couldn't find a tool to set the xattrs. "
4932                         "Install either the python 'pyxattr' or 'xattr' "
4933                         "modules, or the GNU 'attr' package "
4934                         "(which contains the 'setfattr' tool).")
4935                 else:
4936                     raise XAttrUnavailableError(
4937                         "Couldn't find a tool to set the xattrs. "
4938                         "Install either the python 'xattr' module, "
4939                         "or the 'xattr' binary.")
4940
4941
4942 def random_birthday(year_field, month_field, day_field):
4943     start_date = datetime.date(1950, 1, 1)
4944     end_date = datetime.date(1995, 12, 31)
4945     offset = random.randint(0, (end_date - start_date).days)
4946     random_date = start_date + datetime.timedelta(offset)
4947     return {
4948         year_field: str(random_date.year),
4949         month_field: str(random_date.month),
4950         day_field: str(random_date.day),
4951     }
4952
4953
4954 # Templates for internet shortcut files, which are plain text files.
4955 DOT_URL_LINK_TEMPLATE = '''
4956 [InternetShortcut]
4957 URL=%(url)s
4958 '''.lstrip()
4959
4960 DOT_WEBLOC_LINK_TEMPLATE = '''
4961 <?xml version="1.0" encoding="UTF-8"?>
4962 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4963 <plist version="1.0">
4964 <dict>
4965 \t<key>URL</key>
4966 \t<string>%(url)s</string>
4967 </dict>
4968 </plist>
4969 '''.lstrip()
4970
4971 DOT_DESKTOP_LINK_TEMPLATE = '''
4972 [Desktop Entry]
4973 Encoding=UTF-8
4974 Name=%(filename)s
4975 Type=Link
4976 URL=%(url)s
4977 Icon=text-html
4978 '''.lstrip()
4979
4980 LINK_TEMPLATES = {
4981     'url': DOT_URL_LINK_TEMPLATE,
4982     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4983     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4984 }
4985
4986
4987 def iri_to_uri(iri):
4988     """
4989     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4990
4991     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4992     """
4993
4994     iri_parts = compat_urllib_parse_urlparse(iri)
4995
4996     if '[' in iri_parts.netloc:
4997         raise ValueError('IPv6 URIs are not, yet, supported.')
4998         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4999
5000     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5001
5002     net_location = ''
5003     if iri_parts.username:
5004         net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
5005         if iri_parts.password is not None:
5006             net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
5007         net_location += '@'
5008
5009     net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
5010     # The 'idna' encoding produces ASCII text.
5011     if iri_parts.port is not None and iri_parts.port != 80:
5012         net_location += ':' + str(iri_parts.port)
5013
5014     return compat_urllib_parse_urlunparse(
5015         (iri_parts.scheme,
5016             net_location,
5017
5018             compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5019
5020             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5021             compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5022
5023             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5024             compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5025
5026             compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5027
5028     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5029
5030
5031 def to_high_limit_path(path):
5032     if sys.platform in ['win32', 'cygwin']:
5033         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5034         return r'\\?\ '.rstrip() + os.path.abspath(path)
5035
5036     return path
5037
5038
5039 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
5040     val = traverse_obj(obj, *variadic(field))
5041     if val in ignore:
5042         return default
5043     return template % (func(val) if func else val)
5044
5045
5046 def clean_podcast_url(url):
5047     return re.sub(r'''(?x)
5048         (?:
5049             (?:
5050                 chtbl\.com/track|
5051                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5052                 play\.podtrac\.com
5053             )/[^/]+|
5054             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5055             flex\.acast\.com|
5056             pd(?:
5057                 cn\.co| # https://podcorn.com/analytics-prefix/
5058                 st\.fm # https://podsights.com/docs/
5059             )/e
5060         )/''', '', url)
5061
5062
5063 _HEX_TABLE = '0123456789abcdef'
5064
5065
5066 def random_uuidv4():
5067     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5068
5069
5070 def make_dir(path, to_screen=None):
5071     try:
5072         dn = os.path.dirname(path)
5073         if dn and not os.path.exists(dn):
5074             os.makedirs(dn)
5075         return True
5076     except (OSError, IOError) as err:
5077         if callable(to_screen) is not None:
5078             to_screen('unable to create directory ' + error_to_compat_str(err))
5079         return False
5080
5081
5082 def get_executable_path():
5083     from zipimport import zipimporter
5084     if hasattr(sys, 'frozen'):  # Running from PyInstaller
5085         path = os.path.dirname(sys.executable)
5086     elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
5087         path = os.path.join(os.path.dirname(__file__), '../..')
5088     else:
5089         path = os.path.join(os.path.dirname(__file__), '..')
5090     return os.path.abspath(path)
5091
5092
5093 def load_plugins(name, suffix, namespace):
5094     classes = {}
5095     try:
5096         plugins_spec = importlib.util.spec_from_file_location(
5097             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5098         plugins = importlib.util.module_from_spec(plugins_spec)
5099         sys.modules[plugins_spec.name] = plugins
5100         plugins_spec.loader.exec_module(plugins)
5101         for name in dir(plugins):
5102             if name in namespace:
5103                 continue
5104             if not name.endswith(suffix):
5105                 continue
5106             klass = getattr(plugins, name)
5107             classes[name] = namespace[name] = klass
5108     except FileNotFoundError:
5109         pass
5110     return classes
5111
5112
5113 def traverse_obj(
5114         obj, *path_list, default=None, expected_type=None, get_all=True,
5115         casesense=True, is_user_input=False, traverse_string=False):
5116     ''' Traverse nested list/dict/tuple
5117     @param path_list        A list of paths which are checked one by one.
5118                             Each path is a list of keys where each key is a string,
5119                             a function, a tuple of strings/None or "...".
5120                             When a fuction is given, it takes the key as argument and
5121                             returns whether the key matches or not. When a tuple is given,
5122                             all the keys given in the tuple are traversed, and
5123                             "..." traverses all the keys in the object
5124                             "None" returns the object without traversal
5125     @param default          Default value to return
5126     @param expected_type    Only accept final value of this type (Can also be any callable)
5127     @param get_all          Return all the values obtained from a path or only the first one
5128     @param casesense        Whether to consider dictionary keys as case sensitive
5129     @param is_user_input    Whether the keys are generated from user input. If True,
5130                             strings are converted to int/slice if necessary
5131     @param traverse_string  Whether to traverse inside strings. If True, any
5132                             non-compatible object will also be converted into a string
5133     # TODO: Write tests
5134     '''
5135     if not casesense:
5136         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5137         path_list = (map(_lower, variadic(path)) for path in path_list)
5138
5139     def _traverse_obj(obj, path, _current_depth=0):
5140         nonlocal depth
5141         path = tuple(variadic(path))
5142         for i, key in enumerate(path):
5143             if None in (key, obj):
5144                 return obj
5145             if isinstance(key, (list, tuple)):
5146                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5147                 key = ...
5148             if key is ...:
5149                 obj = (obj.values() if isinstance(obj, dict)
5150                        else obj if isinstance(obj, (list, tuple, LazyList))
5151                        else str(obj) if traverse_string else [])
5152                 _current_depth += 1
5153                 depth = max(depth, _current_depth)
5154                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5155             elif callable(key):
5156                 if isinstance(obj, (list, tuple, LazyList)):
5157                     obj = enumerate(obj)
5158                 elif isinstance(obj, dict):
5159                     obj = obj.items()
5160                 else:
5161                     if not traverse_string:
5162                         return None
5163                     obj = str(obj)
5164                 _current_depth += 1
5165                 depth = max(depth, _current_depth)
5166                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5167             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5168                 obj = (obj.get(key) if casesense or (key in obj)
5169                        else next((v for k, v in obj.items() if _lower(k) == key), None))
5170             else:
5171                 if is_user_input:
5172                     key = (int_or_none(key) if ':' not in key
5173                            else slice(*map(int_or_none, key.split(':'))))
5174                     if key == slice(None):
5175                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5176                 if not isinstance(key, (int, slice)):
5177                     return None
5178                 if not isinstance(obj, (list, tuple, LazyList)):
5179                     if not traverse_string:
5180                         return None
5181                     obj = str(obj)
5182                 try:
5183                     obj = obj[key]
5184                 except IndexError:
5185                     return None
5186         return obj
5187
5188     if isinstance(expected_type, type):
5189         type_test = lambda val: val if isinstance(val, expected_type) else None
5190     elif expected_type is not None:
5191         type_test = expected_type
5192     else:
5193         type_test = lambda val: val
5194
5195     for path in path_list:
5196         depth = 0
5197         val = _traverse_obj(obj, path)
5198         if val is not None:
5199             if depth:
5200                 for _ in range(depth - 1):
5201                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5202                 val = [v for v in map(type_test, val) if v is not None]
5203                 if val:
5204                     return val if get_all else val[0]
5205             else:
5206                 val = type_test(val)
5207                 if val is not None:
5208                     return val
5209     return default
5210
5211
5212 def traverse_dict(dictn, keys, casesense=True):
5213     write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5214                  'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5215     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5216
5217
5218 def variadic(x, allowed_types=(str, bytes, dict)):
5219     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5220
5221
5222 def decode_base(value, digits):
5223     # This will convert given base-x string to scalar (long or int)
5224     table = {char: index for index, char in enumerate(digits)}
5225     result = 0
5226     base = len(digits)
5227     for chr in value:
5228         result *= base
5229         result += table[chr]
5230     return result
5231
5232
5233 def time_seconds(**kwargs):
5234     t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5235     return t.timestamp()
5236
5237
5238 # create a JSON Web Signature (jws) with HS256 algorithm
5239 # the resulting format is in JWS Compact Serialization
5240 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5241 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5242 def jwt_encode_hs256(payload_data, key, headers={}):
5243     header_data = {
5244         'alg': 'HS256',
5245         'typ': 'JWT',
5246     }
5247     if headers:
5248         header_data.update(headers)
5249     header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5250     payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5251     h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5252     signature_b64 = base64.b64encode(h.digest())
5253     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5254     return token
5255
5256
5257 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5258 def jwt_decode_hs256(jwt):
5259     header_b64, payload_b64, signature_b64 = jwt.split('.')
5260     payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5261     return payload_data
5262
5263
5264 def supports_terminal_sequences(stream):
5265     if compat_os_name == 'nt':
5266         from .compat import WINDOWS_VT_MODE  # Must be imported locally
5267         if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5268             return False
5269     elif not os.getenv('TERM'):
5270         return False
5271     try:
5272         return stream.isatty()
5273     except BaseException:
5274         return False
5275
5276
5277 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5278
5279
5280 def remove_terminal_sequences(string):
5281     return _terminal_sequences_re.sub('', string)
5282
5283
5284 def number_of_digits(number):
5285     return len('%d' % number)
5286
5287
5288 def join_nonempty(*values, delim='-', from_dict=None):
5289     if from_dict is not None:
5290         values = map(from_dict.get, values)
5291     return delim.join(map(str, filter(None, values)))
5292
5293
5294 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5295     """
5296     Find the largest format dimensions in terms of video width and, for each thumbnail:
5297     * Modify the URL: Match the width with the provided regex and replace with the former width
5298     * Update dimensions
5299
5300     This function is useful with video services that scale the provided thumbnails on demand
5301     """
5302     _keys = ('width', 'height')
5303     max_dimensions = max(
5304         [tuple(format.get(k) or 0 for k in _keys) for format in formats],
5305         default=(0, 0))
5306     if not max_dimensions[0]:
5307         return thumbnails
5308     return [
5309         merge_dicts(
5310             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5311             dict(zip(_keys, max_dimensions)), thumbnail)
5312         for thumbnail in thumbnails
5313     ]
5314
5315
5316 def parse_http_range(range):
5317     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5318     if not range:
5319         return None, None, None
5320     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5321     if not crg:
5322         return None, None, None
5323     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5324
5325
5326 class Config:
5327     own_args = None
5328     filename = None
5329     __initialized = False
5330
5331     def __init__(self, parser, label=None):
5332         self._parser, self.label = parser, label
5333         self._loaded_paths, self.configs = set(), []
5334
5335     def init(self, args=None, filename=None):
5336         assert not self.__initialized
5337         directory = ''
5338         if filename:
5339             location = os.path.realpath(filename)
5340             directory = os.path.dirname(location)
5341             if location in self._loaded_paths:
5342                 return False
5343             self._loaded_paths.add(location)
5344
5345         self.__initialized = True
5346         self.own_args, self.filename = args, filename
5347         for location in self._parser.parse_args(args)[0].config_locations or []:
5348             location = os.path.join(directory, expand_path(location))
5349             if os.path.isdir(location):
5350                 location = os.path.join(location, 'yt-dlp.conf')
5351             if not os.path.exists(location):
5352                 self._parser.error(f'config location {location} does not exist')
5353             self.append_config(self.read_file(location), location)
5354         return True
5355
5356     def __str__(self):
5357         label = join_nonempty(
5358             self.label, 'config', f'"{self.filename}"' if self.filename else '',
5359             delim=' ')
5360         return join_nonempty(
5361             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5362             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5363             delim='\n')
5364
5365     @staticmethod
5366     def read_file(filename, default=[]):
5367         try:
5368             optionf = open(filename)
5369         except IOError:
5370             return default  # silently skip if file is not present
5371         try:
5372             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5373             contents = optionf.read()
5374             if sys.version_info < (3,):
5375                 contents = contents.decode(preferredencoding())
5376             res = compat_shlex_split(contents, comments=True)
5377         finally:
5378             optionf.close()
5379         return res
5380
5381     @staticmethod
5382     def hide_login_info(opts):
5383         PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5384         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5385
5386         def _scrub_eq(o):
5387             m = eqre.match(o)
5388             if m:
5389                 return m.group('key') + '=PRIVATE'
5390             else:
5391                 return o
5392
5393         opts = list(map(_scrub_eq, opts))
5394         for idx, opt in enumerate(opts):
5395             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5396                 opts[idx + 1] = 'PRIVATE'
5397         return opts
5398
5399     def append_config(self, *args, label=None):
5400         config = type(self)(self._parser, label)
5401         config._loaded_paths = self._loaded_paths
5402         if config.init(*args):
5403             self.configs.append(config)
5404
5405     @property
5406     def all_args(self):
5407         for config in reversed(self.configs):
5408             yield from config.all_args
5409         yield from self.own_args or []
5410
5411     def parse_args(self):
5412         return self._parser.parse_args(list(self.all_args))
5413
5414
5415 class WebSocketsWrapper():
5416     """Wraps websockets module to use in non-async scopes"""
5417
5418     def __init__(self, url, headers=None):
5419         self.loop = asyncio.events.new_event_loop()
5420         self.conn = compat_websockets.connect(
5421             url, extra_headers=headers, ping_interval=None,
5422             close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5423         atexit.register(self.__exit__, None, None, None)
5424
5425     def __enter__(self):
5426         self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5427         return self
5428
5429     def send(self, *args):
5430         self.run_with_loop(self.pool.send(*args), self.loop)
5431
5432     def recv(self, *args):
5433         return self.run_with_loop(self.pool.recv(*args), self.loop)
5434
5435     def __exit__(self, type, value, traceback):
5436         try:
5437             return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5438         finally:
5439             self.loop.close()
5440             self._cancel_all_tasks(self.loop)
5441
5442     # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5443     # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5444     @staticmethod
5445     def run_with_loop(main, loop):
5446         if not asyncio.coroutines.iscoroutine(main):
5447             raise ValueError(f'a coroutine was expected, got {main!r}')
5448
5449         try:
5450             return loop.run_until_complete(main)
5451         finally:
5452             loop.run_until_complete(loop.shutdown_asyncgens())
5453             if hasattr(loop, 'shutdown_default_executor'):
5454                 loop.run_until_complete(loop.shutdown_default_executor())
5455
5456     @staticmethod
5457     def _cancel_all_tasks(loop):
5458         to_cancel = asyncio.tasks.all_tasks(loop)
5459
5460         if not to_cancel:
5461             return
5462
5463         for task in to_cancel:
5464             task.cancel()
5465
5466         loop.run_until_complete(
5467             asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5468
5469         for task in to_cancel:
5470             if task.cancelled():
5471                 continue
5472             if task.exception() is not None:
5473                 loop.call_exception_handler({
5474                     'message': 'unhandled exception during asyncio.run() shutdown',
5475                     'exception': task.exception(),
5476                     'task': task,
5477                 })
5478
5479
5480 has_websockets = bool(compat_websockets)
5481
5482
5483 def merge_headers(*dicts):
5484     """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
5485     return {k.capitalize(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}