#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_brotli,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_websockets,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)

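# Example (illustrative; the chosen version is random):
#   random_user_agent()
#   # => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'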

SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if compat_brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': ', '.join(SUPPORTED_ENCODINGS),
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

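# Example (illustrative, with a hypothetical namespace URI):
#   xpath_with_ns('media:song/url', {'media': 'http://example.com/ns'})
#   # => '{http://example.com/ns}song/url'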

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text

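# Example (illustrative): a missing node falls back to `default` instead of
# raising, e.g. for doc = compat_etree_fromstring('<root><a>1</a></root>'):
#   xpath_text(doc, './a')                # => '1'
#   xpath_text(doc, './b', default=None)  # => None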

def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

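# Example (illustrative): the generator yields (content, whole element) pairs:
#   list(get_elements_text_and_html_by_attribute(
#       'class', 'foo', '<span class="foo">bar</span>'))
#   # => [('bar', '<span class="foo">bar</span>')]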

class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

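# Example (illustrative): nested tags of the same name are balanced before the
# closing tag is accepted:
#   get_element_text_and_html_by_tag('div', '<div>a<div>b</div>c</div>')
#   # => ('a<div>b</div>c', '<div>a<div>b</div>c</div>')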

class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

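# Example (illustrative):
#   clean_html('<p>foo<br/>bar</p>  <p>baz</p>')  # => 'foo\nbar\nbaz'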

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it can be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

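# Example (illustrative):
#   sanitize_filename('foo: bar')                   # => 'foo - bar'
#   sanitize_filename('foo: bar', restricted=True)  # => 'foo_-_bar'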

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

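# Example (illustrative):
#   sanitize_url('//example.com/video')      # => 'http://example.com/video'
#   sanitize_url('rmtp://example.com/live')  # => 'rtmp://example.com/live'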

def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')

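# Example (illustrative, with hypothetical credentials):
#   extract_basic_auth('http://user:pass@example.com/x')
#   # => ('http://example.com/x', 'Basic dXNlcjpwYXNz')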

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

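# Example (illustrative): named, decimal and hexadecimal references all decode:
#   unescapeHTML('&amp; &#38; &#x26;')  # => '& & &'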

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)

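# Example (illustrative):
#   timetuple_from_msec(123456789)
#   # => Time(hours=34, minutes=17, seconds=36, milliseconds=789)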

def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

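# Example (illustrative):
#   formatSeconds(3661)             # => '1:01:01'
#   formatSeconds(61.5, msec=True)  # => '1:01.500'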

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

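# Example (illustrative): the internal marker header suppresses compression
# and is itself dropped before the request is sent:
#   handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})
#   # => {}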

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return compat_brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # (urllib capitalizes the dict keys because of this bug)
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str

1753
1754 def parse_iso8601(date_str, delimiter='T', timezone=None):
1755 """ Return a UNIX timestamp from the given date """
1756
1757 if date_str is None:
1758 return None
1759
1760 date_str = re.sub(r'\.[0-9]+', '', date_str)
1761
1762 if timezone is None:
1763 timezone, date_str = extract_timezone(date_str)
1764
1765 try:
1766 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1767 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1768 return calendar.timegm(dt.timetuple())
1769 except ValueError:
1770 pass
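# Illustrative examples (not part of the original module; values follow from the code above):
#   parse_iso8601('2014-03-23T23:04:26+0100')  == 1395612266
#   parse_iso8601('2014-03-23T22:04:26', timezone=datetime.timedelta(0))  == 1395612266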
1771
1772
1773 def date_formats(day_first=True):
1774 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1775
1776
1777 def unified_strdate(date_str, day_first=True):
1778 """Return a string with the date in the format YYYYMMDD"""
1779
1780 if date_str is None:
1781 return None
1782 upload_date = None
1783 # Replace commas
1784 date_str = date_str.replace(',', ' ')
1785 # Remove AM/PM + timezone
1786 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1787 _, date_str = extract_timezone(date_str)
1788
1789 for expression in date_formats(day_first):
1790 try:
1791 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1792 except ValueError:
1793 pass
1794 if upload_date is None:
1795 timetuple = email.utils.parsedate_tz(date_str)
1796 if timetuple:
1797 try:
1798 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1799 except ValueError:
1800 pass
1801 if upload_date is not None:
1802 return compat_str(upload_date)
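# Illustrative examples (not part of the original module):
#   unified_strdate('December 21, 2010')  == '20101221'
#   unified_strdate('1968/12/10')         == '19681210'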
1803
1804
1805 def unified_timestamp(date_str, day_first=True):
1806 if date_str is None:
1807 return None
1808
1809 date_str = re.sub(r'[,|]', '', date_str)
1810
1811 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1812 timezone, date_str = extract_timezone(date_str)
1813
1814 # Remove AM/PM + timezone
1815 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1816
1817 # Remove unrecognized timezones from ISO 8601 alike timestamps
1818 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1819 if m:
1820 date_str = date_str[:-len(m.group('tz'))]
1821
1822 # Python only supports microseconds, so remove nanoseconds
1823 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1824 if m:
1825 date_str = m.group(1)
1826
1827 for expression in date_formats(day_first):
1828 try:
1829 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1830 return calendar.timegm(dt.timetuple())
1831 except ValueError:
1832 pass
1833 timetuple = email.utils.parsedate_tz(date_str)
1834 if timetuple:
1835 return calendar.timegm(timetuple) + pm_delta * 3600
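# Illustrative example (not part of the original module):
#   unified_timestamp('December 15, 2017 at 7:49 am')  == 1513324140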
1836
1837
1838 def determine_ext(url, default_ext='unknown_video'):
1839 if url is None or '.' not in url:
1840 return default_ext
1841 guess = url.partition('?')[0].rpartition('.')[2]
1842 if re.match(r'^[A-Za-z0-9]+$', guess):
1843 return guess
1844 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1845 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1846 return guess.rstrip('/')
1847 else:
1848 return default_ext
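# Illustrative examples (not part of the original module; example.com is a placeholder):
#   determine_ext('http://example.com/video.mp4?dl=1')       == 'mp4'
#   determine_ext('http://example.com/video.mp4/?download')  == 'mp4'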
1849
1850
1851 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1852 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1853
1854
1855 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1856 """
1857 Return a datetime object from a string in the format YYYYMMDD or
1858 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1859
1860 format: string date format used to parse date_str
1861 precision: round the time portion of the datetime object to this unit;
1862 one of auto|microsecond|second|minute|hour|day.
1863 auto: round to the unit provided in date_str (if applicable).
1864 """
1865 auto_precision = False
1866 if precision == 'auto':
1867 auto_precision = True
1868 precision = 'microsecond'
1869 today = datetime_round(datetime.datetime.utcnow(), precision)
1870 if date_str in ('now', 'today'):
1871 return today
1872 if date_str == 'yesterday':
1873 return today - datetime.timedelta(days=1)
1874 match = re.match(
1875 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1876 date_str)
1877 if match is not None:
1878 start_time = datetime_from_str(match.group('start'), precision, format)
1879 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1880 unit = match.group('unit')
1881 if unit == 'month' or unit == 'year':
1882 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1883 unit = 'day'
1884 else:
1885 if unit == 'week':
1886 unit = 'day'
1887 time *= 7
1888 delta = datetime.timedelta(**{unit + 's': time})
1889 new_date = start_time + delta
1890 if auto_precision:
1891 return datetime_round(new_date, unit)
1892 return new_date
1893
1894 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
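# Illustrative examples (not part of the original module; 'now'/'today' refer to UTC):
#   datetime_from_str('20220315+2months')            == datetime.datetime(2022, 5, 15, 0, 0)
#   datetime_from_str('now-1week', precision='day')  -> midnight (UTC) one week ago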
1895
1896
1897 def date_from_str(date_str, format='%Y%m%d', strict=False):
1898 """
1899 Return a datetime object from a string in the format YYYYMMDD or
1900 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1901
1902 If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1903
1904 format: string date format used to parse date_str
1905 """
1906 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1907 raise ValueError(f'Invalid date format {date_str}')
1908 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1909
1910
1911 def datetime_add_months(dt, months):
1912 """Increment/Decrement a datetime object by months."""
1913 month = dt.month + months - 1
1914 year = dt.year + month // 12
1915 month = month % 12 + 1
1916 day = min(dt.day, calendar.monthrange(year, month)[1])
1917 return dt.replace(year, month, day)
1918
1919
1920 def datetime_round(dt, precision='day'):
1921 """
1922 Round a datetime object's time to a specific precision
1923 """
1924 if precision == 'microsecond':
1925 return dt
1926
1927 unit_seconds = {
1928 'day': 86400,
1929 'hour': 3600,
1930 'minute': 60,
1931 'second': 1,
1932 }
1933 roundto = lambda x, n: ((x + n / 2) // n) * n
1934 timestamp = calendar.timegm(dt.timetuple())
1935 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1936
1937
1938 def hyphenate_date(date_str):
1939 """
1940 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1941 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1942 if match is not None:
1943 return '-'.join(match.groups())
1944 else:
1945 return date_str
1946
1947
1948 class DateRange(object):
1949 """Represents a time interval between two dates"""
1950
1951 def __init__(self, start=None, end=None):
1952 """start and end must be strings in the format accepted by date"""
1953 if start is not None:
1954 self.start = date_from_str(start, strict=True)
1955 else:
1956 self.start = datetime.datetime.min.date()
1957 if end is not None:
1958 self.end = date_from_str(end, strict=True)
1959 else:
1960 self.end = datetime.datetime.max.date()
1961 if self.start > self.end:
1962 raise ValueError('Date range "%s": the start date must be before the end date' % self)
1963
1964 @classmethod
1965 def day(cls, day):
1966 """Returns a range that only contains the given day"""
1967 return cls(day, day)
1968
1969 def __contains__(self, date):
1970 """Check if the date is in the range"""
1971 if not isinstance(date, datetime.date):
1972 date = date_from_str(date)
1973 return self.start <= date <= self.end
1974
1975 def __str__(self):
1976 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
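# Illustrative example (not part of the original module):
#   '20220115' in DateRange('20220101', '20220131')  == True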
1977
1978
1979 def platform_name():
1980 """ Returns the platform name as a compat_str """
1981 res = platform.platform()
1982 if isinstance(res, bytes):
1983 res = res.decode(preferredencoding())
1984
1985 assert isinstance(res, compat_str)
1986 return res
1987
1988
1989 def get_windows_version():
1990 ''' Get Windows version. None if it's not running on Windows '''
1991 if compat_os_name == 'nt':
1992 return version_tuple(platform.win32_ver()[1])
1993 else:
1994 return None
1995
1996
1997 def _windows_write_string(s, out):
1998 """ Returns True if the string was written using special methods,
1999 False if it has yet to be written out."""
2000 # Adapted from http://stackoverflow.com/a/3259271/35070
2001
2002 import ctypes.wintypes
2003
2004 WIN_OUTPUT_IDS = {
2005 1: -11,
2006 2: -12,
2007 }
2008
2009 try:
2010 fileno = out.fileno()
2011 except AttributeError:
2012 # If the output stream doesn't have a fileno, it's virtual
2013 return False
2014 except io.UnsupportedOperation:
2015 # Some strange Windows pseudo files?
2016 return False
2017 if fileno not in WIN_OUTPUT_IDS:
2018 return False
2019
2020 GetStdHandle = compat_ctypes_WINFUNCTYPE(
2021 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2022 ('GetStdHandle', ctypes.windll.kernel32))
2023 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2024
2025 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2026 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2027 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2028 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2029 written = ctypes.wintypes.DWORD(0)
2030
2031 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2032 FILE_TYPE_CHAR = 0x0002
2033 FILE_TYPE_REMOTE = 0x8000
2034 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2035 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2036 ctypes.POINTER(ctypes.wintypes.DWORD))(
2037 ('GetConsoleMode', ctypes.windll.kernel32))
2038 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2039
2040 def not_a_console(handle):
2041 if handle == INVALID_HANDLE_VALUE or handle is None:
2042 return True
2043 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2044 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2045
2046 if not_a_console(h):
2047 return False
2048
2049 def next_nonbmp_pos(s):
2050 try:
2051 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2052 except StopIteration:
2053 return len(s)
2054
2055 while s:
2056 count = min(next_nonbmp_pos(s), 1024)
2057
2058 ret = WriteConsoleW(
2059 h, s, count if count else 2, ctypes.byref(written), None)
2060 if ret == 0:
2061 raise OSError('Failed to write string')
2062 if not count: # We just wrote a non-BMP character
2063 assert written.value == 2
2064 s = s[1:]
2065 else:
2066 assert written.value > 0
2067 s = s[written.value:]
2068 return True
2069
2070
2071 def write_string(s, out=None, encoding=None):
2072 if out is None:
2073 out = sys.stderr
2074 assert type(s) == compat_str
2075
2076 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2077 if _windows_write_string(s, out):
2078 return
2079
2080 if ('b' in getattr(out, 'mode', '')
2081 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2082 byt = s.encode(encoding or preferredencoding(), 'ignore')
2083 out.write(byt)
2084 elif hasattr(out, 'buffer'):
2085 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2086 byt = s.encode(enc, 'ignore')
2087 out.buffer.write(byt)
2088 else:
2089 out.write(s)
2090 out.flush()
2091
2092
2093 def bytes_to_intlist(bs):
2094 if not bs:
2095 return []
2096 if isinstance(bs[0], int): # Python 3
2097 return list(bs)
2098 else:
2099 return [ord(c) for c in bs]
2100
2101
2102 def intlist_to_bytes(xs):
2103 if not xs:
2104 return b''
2105 return compat_struct_pack('%dB' % len(xs), *xs)
2106
2107
2108 # Cross-platform file locking
2109 if sys.platform == 'win32':
2110 import ctypes.wintypes
2111 import msvcrt
2112
2113 class OVERLAPPED(ctypes.Structure):
2114 _fields_ = [
2115 ('Internal', ctypes.wintypes.LPVOID),
2116 ('InternalHigh', ctypes.wintypes.LPVOID),
2117 ('Offset', ctypes.wintypes.DWORD),
2118 ('OffsetHigh', ctypes.wintypes.DWORD),
2119 ('hEvent', ctypes.wintypes.HANDLE),
2120 ]
2121
2122 kernel32 = ctypes.windll.kernel32
2123 LockFileEx = kernel32.LockFileEx
2124 LockFileEx.argtypes = [
2125 ctypes.wintypes.HANDLE, # hFile
2126 ctypes.wintypes.DWORD, # dwFlags
2127 ctypes.wintypes.DWORD, # dwReserved
2128 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2129 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2130 ctypes.POINTER(OVERLAPPED) # Overlapped
2131 ]
2132 LockFileEx.restype = ctypes.wintypes.BOOL
2133 UnlockFileEx = kernel32.UnlockFileEx
2134 UnlockFileEx.argtypes = [
2135 ctypes.wintypes.HANDLE, # hFile
2136 ctypes.wintypes.DWORD, # dwReserved
2137 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2138 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2139 ctypes.POINTER(OVERLAPPED) # Overlapped
2140 ]
2141 UnlockFileEx.restype = ctypes.wintypes.BOOL
2142 whole_low = 0xffffffff
2143 whole_high = 0x7fffffff
2144
2145 def _lock_file(f, exclusive, block):
2146 overlapped = OVERLAPPED()
2147 overlapped.Offset = 0
2148 overlapped.OffsetHigh = 0
2149 overlapped.hEvent = 0
2150 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2151
2152 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2153 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2154 0, whole_low, whole_high, f._lock_file_overlapped_p):
2155 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2156
2157 def _unlock_file(f):
2158 assert f._lock_file_overlapped_p
2159 handle = msvcrt.get_osfhandle(f.fileno())
2160 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2161 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2162
2163 else:
2164 try:
2165 import fcntl
2166
2167 def _lock_file(f, exclusive, block):
2168 try:
2169 fcntl.flock(f,
2170 fcntl.LOCK_SH if not exclusive
2171 else fcntl.LOCK_EX if block
2172 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2173 except BlockingIOError:
2174 raise
2175 except OSError: # AOSP does not have flock()
2176 fcntl.lockf(f,
2177 fcntl.LOCK_SH if not exclusive
2178 else fcntl.LOCK_EX if block
2179 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2180
2181 def _unlock_file(f):
2182 try:
2183 fcntl.flock(f, fcntl.LOCK_UN)
2184 except OSError:
2185 fcntl.lockf(f, fcntl.LOCK_UN)
2186
2187 except ImportError:
2188 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2189
2190 def _lock_file(f, exclusive, block):
2191 raise IOError(UNSUPPORTED_MSG)
2192
2193 def _unlock_file(f):
2194 raise IOError(UNSUPPORTED_MSG)
2195
2196
2197 class locked_file(object):
2198 _closed = False
2199
2200 def __init__(self, filename, mode, block=True, encoding=None):
2201 assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2202 self.f = io.open(filename, mode, encoding=encoding)
2203 self.mode = mode
2204 self.block = block
2205
2206 def __enter__(self):
2207 exclusive = 'r' not in self.mode
2208 try:
2209 _lock_file(self.f, exclusive, self.block)
2210 except IOError:
2211 self.f.close()
2212 raise
2213 return self
2214
2215 def __exit__(self, etype, value, traceback):
2216 try:
2217 if not self._closed:
2218 _unlock_file(self.f)
2219 finally:
2220 self.f.close()
2221 self._closed = True
2222
2223 def __iter__(self):
2224 return iter(self.f)
2225
2226 def write(self, *args):
2227 return self.f.write(*args)
2228
2229 def read(self, *args):
2230 return self.f.read(*args)
2231
2232 def flush(self):
2233 self.f.flush()
2234
2235 def open(self):
2236 return self.__enter__()
2237
2238 def close(self, *args):
2239 self.__exit__(None, None, None)  # no exception info is available here
2240
2241
2242 def get_filesystem_encoding():
2243 encoding = sys.getfilesystemencoding()
2244 return encoding if encoding is not None else 'utf-8'
2245
2246
2247 def shell_quote(args):
2248 quoted_args = []
2249 encoding = get_filesystem_encoding()
2250 for a in args:
2251 if isinstance(a, bytes):
2252 # We may get a filename encoded with 'encodeFilename'
2253 a = a.decode(encoding)
2254 quoted_args.append(compat_shlex_quote(a))
2255 return ' '.join(quoted_args)
2256
2257
2258 def smuggle_url(url, data):
2259 """ Pass additional data in a URL for internal use. """
2260
2261 url, idata = unsmuggle_url(url, {})
2262 data.update(idata)
2263 sdata = compat_urllib_parse_urlencode(
2264 {'__youtubedl_smuggle': json.dumps(data)})
2265 return url + '#' + sdata
2266
2267
2268 def unsmuggle_url(smug_url, default=None):
2269 if '#__youtubedl_smuggle' not in smug_url:
2270 return smug_url, default
2271 url, _, sdata = smug_url.rpartition('#')
2272 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2273 data = json.loads(jsond)
2274 return url, data
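# Illustrative round trip (not part of the original module; the URL is a placeholder):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   unsmuggle_url(url)  == ('https://example.com/video', {'referer': 'https://example.com/'})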
2275
2276
2277 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2278 """ Formats numbers with decimal sufixes like K, M, etc """
2279 num, factor = float_or_none(num), float(factor)
2280 if num is None or num < 0:
2281 return None
2282 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2283 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2284 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2285 if factor == 1024:
2286 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2287 converted = num / (factor ** exponent)
2288 return fmt % (converted, suffix)
2289
2290
2291 def format_bytes(bytes):
2292 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
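# Illustrative examples (not part of the original module):
#   format_decimal_suffix(123456)  == '123k'
#   format_bytes(1024)             == '1.00KiB'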
2293
2294
2295 def lookup_unit_table(unit_table, s):
2296 units_re = '|'.join(re.escape(u) for u in unit_table)
2297 m = re.match(
2298 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2299 if not m:
2300 return None
2301 num_str = m.group('num').replace(',', '.')
2302 mult = unit_table[m.group('unit')]
2303 return int(float(num_str) * mult)
2304
2305
2306 def parse_filesize(s):
2307 if s is None:
2308 return None
2309
2310 # The lower-case forms are of course incorrect and unofficial,
2311 # but we support those too
2312 _UNIT_TABLE = {
2313 'B': 1,
2314 'b': 1,
2315 'bytes': 1,
2316 'KiB': 1024,
2317 'KB': 1000,
2318 'kB': 1024,
2319 'Kb': 1000,
2320 'kb': 1000,
2321 'kilobytes': 1000,
2322 'kibibytes': 1024,
2323 'MiB': 1024 ** 2,
2324 'MB': 1000 ** 2,
2325 'mB': 1024 ** 2,
2326 'Mb': 1000 ** 2,
2327 'mb': 1000 ** 2,
2328 'megabytes': 1000 ** 2,
2329 'mebibytes': 1024 ** 2,
2330 'GiB': 1024 ** 3,
2331 'GB': 1000 ** 3,
2332 'gB': 1024 ** 3,
2333 'Gb': 1000 ** 3,
2334 'gb': 1000 ** 3,
2335 'gigabytes': 1000 ** 3,
2336 'gibibytes': 1024 ** 3,
2337 'TiB': 1024 ** 4,
2338 'TB': 1000 ** 4,
2339 'tB': 1024 ** 4,
2340 'Tb': 1000 ** 4,
2341 'tb': 1000 ** 4,
2342 'terabytes': 1000 ** 4,
2343 'tebibytes': 1024 ** 4,
2344 'PiB': 1024 ** 5,
2345 'PB': 1000 ** 5,
2346 'pB': 1024 ** 5,
2347 'Pb': 1000 ** 5,
2348 'pb': 1000 ** 5,
2349 'petabytes': 1000 ** 5,
2350 'pebibytes': 1024 ** 5,
2351 'EiB': 1024 ** 6,
2352 'EB': 1000 ** 6,
2353 'eB': 1024 ** 6,
2354 'Eb': 1000 ** 6,
2355 'eb': 1000 ** 6,
2356 'exabytes': 1000 ** 6,
2357 'exbibytes': 1024 ** 6,
2358 'ZiB': 1024 ** 7,
2359 'ZB': 1000 ** 7,
2360 'zB': 1024 ** 7,
2361 'Zb': 1000 ** 7,
2362 'zb': 1000 ** 7,
2363 'zettabytes': 1000 ** 7,
2364 'zebibytes': 1024 ** 7,
2365 'YiB': 1024 ** 8,
2366 'YB': 1000 ** 8,
2367 'yB': 1024 ** 8,
2368 'Yb': 1000 ** 8,
2369 'yb': 1000 ** 8,
2370 'yottabytes': 1000 ** 8,
2371 'yobibytes': 1024 ** 8,
2372 }
2373
2374 return lookup_unit_table(_UNIT_TABLE, s)
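# Illustrative examples (not part of the original module; note the decimal/binary distinction):
#   parse_filesize('10 MB')   == 10000000
#   parse_filesize('10 MiB')  == 10485760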
2375
2376
2377 def parse_count(s):
2378 if s is None:
2379 return None
2380
2381 s = re.sub(r'^[^\d]+\s', '', s).strip()
2382
2383 if re.match(r'^[\d,.]+$', s):
2384 return str_to_int(s)
2385
2386 _UNIT_TABLE = {
2387 'k': 1000,
2388 'K': 1000,
2389 'm': 1000 ** 2,
2390 'M': 1000 ** 2,
2391 'kk': 1000 ** 2,
2392 'KK': 1000 ** 2,
2393 'b': 1000 ** 3,
2394 'B': 1000 ** 3,
2395 }
2396
2397 ret = lookup_unit_table(_UNIT_TABLE, s)
2398 if ret is not None:
2399 return ret
2400
2401 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2402 if mobj:
2403 return str_to_int(mobj.group(1))
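# Illustrative examples (not part of the original module):
#   parse_count('1,234')  == 1234
#   parse_count('1.5M')   == 1500000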
2404
2405
2406 def parse_resolution(s):
2407 if s is None:
2408 return {}
2409
2410 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2411 if mobj:
2412 return {
2413 'width': int(mobj.group('w')),
2414 'height': int(mobj.group('h')),
2415 }
2416
2417 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2418 if mobj:
2419 return {'height': int(mobj.group(1))}
2420
2421 mobj = re.search(r'\b([48])[kK]\b', s)
2422 if mobj:
2423 return {'height': int(mobj.group(1)) * 540}
2424
2425 return {}
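# Illustrative examples (not part of the original module):
#   parse_resolution('1920x1080')  == {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       == {'height': 720}
#   parse_resolution('4k')         == {'height': 2160}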
2426
2427
2428 def parse_bitrate(s):
2429 if not isinstance(s, compat_str):
2430 return
2431 mobj = re.search(r'\b(\d+)\s*kbps', s)
2432 if mobj:
2433 return int(mobj.group(1))
2434
2435
2436 def month_by_name(name, lang='en'):
2437 """ Return the number of a month by (locale-independently) English name """
2438
2439 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2440
2441 try:
2442 return month_names.index(name) + 1
2443 except ValueError:
2444 return None
2445
2446
2447 def month_by_abbreviation(abbrev):
2448 """ Return the number of a month by (locale-independently) English
2449 abbreviations """
2450
2451 try:
2452 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2453 except ValueError:
2454 return None
2455
2456
2457 def fix_xml_ampersands(xml_str):
2458 """Replace all the '&' by '&amp;' in XML"""
2459 return re.sub(
2460 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2461 '&amp;',
2462 xml_str)
2463
2464
2465 def setproctitle(title):
2466 assert isinstance(title, compat_str)
2467
2468 # ctypes in Jython is not complete
2469 # http://bugs.jython.org/issue2148
2470 if sys.platform.startswith('java'):
2471 return
2472
2473 try:
2474 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2475 except OSError:
2476 return
2477 except TypeError:
2478 # LoadLibrary in Windows Python 2.7.13 only expects
2479 # a bytestring, but since unicode_literals turns
2480 # every string into a unicode string, it fails.
2481 return
2482 title_bytes = title.encode('utf-8')
2483 buf = ctypes.create_string_buffer(len(title_bytes))
2484 buf.value = title_bytes
2485 try:
2486 libc.prctl(15, buf, 0, 0, 0)
2487 except AttributeError:
2488 return # Strange libc, just skip this
2489
2490
2491 def remove_start(s, start):
2492 return s[len(start):] if s is not None and s.startswith(start) else s
2493
2494
2495 def remove_end(s, end):
2496 return s[:-len(end)] if s is not None and s.endswith(end) else s
2497
2498
2499 def remove_quotes(s):
2500 if s is None or len(s) < 2:
2501 return s
2502 for quote in ('"', "'", ):
2503 if s[0] == quote and s[-1] == quote:
2504 return s[1:-1]
2505 return s
2506
2507
2508 def get_domain(url):
2509 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2510 return domain.group('domain') if domain else None
2511
2512
2513 def url_basename(url):
2514 path = compat_urlparse.urlparse(url).path
2515 return path.strip('/').split('/')[-1]
2516
2517
2518 def base_url(url):
2519 return re.match(r'https?://[^?#&]+/', url).group()
2520
2521
2522 def urljoin(base, path):
2523 if isinstance(path, bytes):
2524 path = path.decode('utf-8')
2525 if not isinstance(path, compat_str) or not path:
2526 return None
2527 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2528 return path
2529 if isinstance(base, bytes):
2530 base = base.decode('utf-8')
2531 if not isinstance(base, compat_str) or not re.match(
2532 r'^(?:https?:)?//', base):
2533 return None
2534 return compat_urlparse.urljoin(base, path)
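# Illustrative examples (not part of the original module; example.com is a placeholder):
#   urljoin('https://example.com/a/', 'b.mp4')                  == 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/', '//cdn.example.com/x.mp4')  == '//cdn.example.com/x.mp4'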
2535
2536
2537 class HEADRequest(compat_urllib_request.Request):
2538 def get_method(self):
2539 return 'HEAD'
2540
2541
2542 class PUTRequest(compat_urllib_request.Request):
2543 def get_method(self):
2544 return 'PUT'
2545
2546
2547 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2548 if get_attr and v is not None:
2549 v = getattr(v, get_attr, None)
2550 try:
2551 return int(v) * invscale // scale
2552 except (ValueError, TypeError, OverflowError):
2553 return default
2554
2555
2556 def str_or_none(v, default=None):
2557 return default if v is None else compat_str(v)
2558
2559
2560 def str_to_int(int_str):
2561 """ A more relaxed version of int_or_none """
2562 if isinstance(int_str, compat_integer_types):
2563 return int_str
2564 elif isinstance(int_str, compat_str):
2565 int_str = re.sub(r'[,\.\+]', '', int_str)
2566 return int_or_none(int_str)
2567
2568
2569 def float_or_none(v, scale=1, invscale=1, default=None):
2570 if v is None:
2571 return default
2572 try:
2573 return float(v) * invscale / scale
2574 except (ValueError, TypeError):
2575 return default
2576
2577
2578 def bool_or_none(v, default=None):
2579 return v if isinstance(v, bool) else default
2580
2581
2582 def strip_or_none(v, default=None):
2583 return v.strip() if isinstance(v, compat_str) else default
2584
2585
2586 def url_or_none(url):
2587 if not url or not isinstance(url, compat_str):
2588 return None
2589 url = url.strip()
2590 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2591
2592
2593 def request_to_url(req):
2594 if isinstance(req, compat_urllib_request.Request):
2595 return req.get_full_url()
2596 else:
2597 return req
2598
2599
2600 def strftime_or_none(timestamp, date_format, default=None):
2601 datetime_object = None
2602 try:
2603 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2604 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2605 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2606 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2607 return datetime_object.strftime(date_format)
2608 except (ValueError, TypeError, AttributeError):
2609 return default
2610
2611
2612 def parse_duration(s):
2613 if not isinstance(s, compat_basestring):
2614 return None
2615 s = s.strip()
2616 if not s:
2617 return None
2618
2619 days, hours, mins, secs, ms = [None] * 5
2620 m = re.match(r'''(?x)
2621 (?P<before_secs>
2622 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2623 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2624 (?P<ms>[.:][0-9]+)?Z?$
2625 ''', s)
2626 if m:
2627 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2628 else:
2629 m = re.match(
2630 r'''(?ix)(?:P?
2631 (?:
2632 [0-9]+\s*y(?:ears?)?\s*
2633 )?
2634 (?:
2635 [0-9]+\s*m(?:onths?)?\s*
2636 )?
2637 (?:
2638 [0-9]+\s*w(?:eeks?)?\s*
2639 )?
2640 (?:
2641 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2642 )?
2643 T)?
2644 (?:
2645 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2646 )?
2647 (?:
2648 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2649 )?
2650 (?:
2651 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2652 )?Z?$''', s)
2653 if m:
2654 days, hours, mins, secs, ms = m.groups()
2655 else:
2656 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2657 if m:
2658 hours, mins = m.groups()
2659 else:
2660 return None
2661
2662 duration = 0
2663 if secs:
2664 duration += float(secs)
2665 if mins:
2666 duration += float(mins) * 60
2667 if hours:
2668 duration += float(hours) * 60 * 60
2669 if days:
2670 duration += float(days) * 24 * 60 * 60
2671 if ms:
2672 duration += float(ms.replace(':', '.'))
2673 return duration
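# Illustrative examples (not part of the original module):
#   parse_duration('1:30')     == 90.0
#   parse_duration('2h 30m')   == 9000.0
#   parse_duration('PT1H30M')  == 5400.0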
2674
2675
2676 def prepend_extension(filename, ext, expected_real_ext=None):
2677 name, real_ext = os.path.splitext(filename)
2678 return (
2679 '{0}.{1}{2}'.format(name, ext, real_ext)
2680 if not expected_real_ext or real_ext[1:] == expected_real_ext
2681 else '{0}.{1}'.format(filename, ext))
2682
2683
2684 def replace_extension(filename, ext, expected_real_ext=None):
2685 name, real_ext = os.path.splitext(filename)
2686 return '{0}.{1}'.format(
2687 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2688 ext)
2689
2690
2691 def check_executable(exe, args=[]):
2692 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2693 args can be a list of arguments for a short output (like -version) """
2694 try:
2695 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2696 except OSError:
2697 return False
2698 return exe
2699
2700
2701 def _get_exe_version_output(exe, args):
2702 try:
2703 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2704 # SIGTTOU if yt-dlp is run in the background.
2705 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2706 out, _ = Popen(
2707 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2708 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2709 except OSError:
2710 return False
2711 if isinstance(out, bytes): # Python 2.x
2712 out = out.decode('ascii', 'ignore')
2713 return out
2714
2715
2716 def detect_exe_version(output, version_re=None, unrecognized='present'):
2717 assert isinstance(output, compat_str)
2718 if version_re is None:
2719 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2720 m = re.search(version_re, output)
2721 if m:
2722 return m.group(1)
2723 else:
2724 return unrecognized
2725
2726
2727 def get_exe_version(exe, args=['--version'],
2728 version_re=None, unrecognized='present'):
2729 """ Returns the version of the specified executable,
2730 or False if the executable is not present """
2731 out = _get_exe_version_output(exe, args)
2732 return detect_exe_version(out, version_re, unrecognized) if out else False
2733
2734
2735 class LazyList(collections.abc.Sequence):
2736 ''' Lazy immutable list from an iterable
2737 Note that slices of a LazyList are lists and not LazyList'''
2738
2739 class IndexError(IndexError):
2740 pass
2741
2742 def __init__(self, iterable, *, reverse=False, _cache=None):
2743 self.__iterable = iter(iterable)
2744 self.__cache = [] if _cache is None else _cache
2745 self.__reversed = reverse
2746
2747 def __iter__(self):
2748 if self.__reversed:
2749 # We need to consume the entire iterable to iterate in reverse
2750 yield from self.exhaust()
2751 return
2752 yield from self.__cache
2753 for item in self.__iterable:
2754 self.__cache.append(item)
2755 yield item
2756
2757 def __exhaust(self):
2758 self.__cache.extend(self.__iterable)
2759 # Discard the emptied iterable to make it pickle-able
2760 self.__iterable = []
2761 return self.__cache
2762
2763 def exhaust(self):
2764 ''' Evaluate the entire iterable '''
2765 return self.__exhaust()[::-1 if self.__reversed else 1]
2766
2767 @staticmethod
2768 def __reverse_index(x):
2769 return None if x is None else -(x + 1)
2770
2771 def __getitem__(self, idx):
2772 if isinstance(idx, slice):
2773 if self.__reversed:
2774 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2775 start, stop, step = idx.start, idx.stop, idx.step or 1
2776 elif isinstance(idx, int):
2777 if self.__reversed:
2778 idx = self.__reverse_index(idx)
2779 start, stop, step = idx, idx, 0
2780 else:
2781 raise TypeError('indices must be integers or slices')
2782 if ((start or 0) < 0 or (stop or 0) < 0
2783 or (start is None and step < 0)
2784 or (stop is None and step > 0)):
2785 # We need to consume the entire iterable to be able to slice from the end
2786 # Obviously, never use this with infinite iterables
2787 self.__exhaust()
2788 try:
2789 return self.__cache[idx]
2790 except IndexError as e:
2791 raise self.IndexError(e) from e
2792 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2793 if n > 0:
2794 self.__cache.extend(itertools.islice(self.__iterable, n))
2795 try:
2796 return self.__cache[idx]
2797 except IndexError as e:
2798 raise self.IndexError(e) from e
2799
2800 def __bool__(self):
2801 try:
2802 self[-1] if self.__reversed else self[0]
2803 except self.IndexError:
2804 return False
2805 return True
2806
2807 def __len__(self):
2808 self.__exhaust()
2809 return len(self.__cache)
2810
2811 def __reversed__(self):
2812 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2813
2814 def __copy__(self):
2815 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2816
2817 def __repr__(self):
2818 # repr and str should mimic a list. So we exhaust the iterable
2819 return repr(self.exhaust())
2820
2821 def __str__(self):
2822 return repr(self.exhaust())
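# Illustrative usage (not part of the original module):
#   LazyList(itertools.count())[5]          == 5   (consumes only items 0..5)
#   list(LazyList(range(5), reverse=True))  == [4, 3, 2, 1, 0]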
2823
2824
2825 class PagedList:
2826
2827 class IndexError(IndexError):
2828 pass
2829
2830 def __len__(self):
2831 # This is only useful for tests
2832 return len(self.getslice())
2833
2834 def __init__(self, pagefunc, pagesize, use_cache=True):
2835 self._pagefunc = pagefunc
2836 self._pagesize = pagesize
2837 self._pagecount = float('inf')
2838 self._use_cache = use_cache
2839 self._cache = {}
2840
2841 def getpage(self, pagenum):
2842 page_results = self._cache.get(pagenum)
2843 if page_results is None:
2844 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2845 if self._use_cache:
2846 self._cache[pagenum] = page_results
2847 return page_results
2848
2849 def getslice(self, start=0, end=None):
2850 return list(self._getslice(start, end))
2851
2852 def _getslice(self, start, end):
2853 raise NotImplementedError('This method must be implemented by subclasses')
2854
2855 def __getitem__(self, idx):
2856 assert self._use_cache, 'Indexing PagedList requires cache'
2857 if not isinstance(idx, int) or idx < 0:
2858 raise TypeError('indices must be non-negative integers')
2859 entries = self.getslice(idx, idx + 1)
2860 if not entries:
2861 raise self.IndexError()
2862 return entries[0]
2863
2864
2865 class OnDemandPagedList(PagedList):
2866 def _getslice(self, start, end):
2867 for pagenum in itertools.count(start // self._pagesize):
2868 firstid = pagenum * self._pagesize
2869 nextfirstid = pagenum * self._pagesize + self._pagesize
2870 if start >= nextfirstid:
2871 continue
2872
2873 startv = (
2874 start % self._pagesize
2875 if firstid <= start < nextfirstid
2876 else 0)
2877 endv = (
2878 ((end - 1) % self._pagesize) + 1
2879 if (end is not None and firstid <= end <= nextfirstid)
2880 else None)
2881
2882 try:
2883 page_results = self.getpage(pagenum)
2884 except Exception:
2885 self._pagecount = pagenum - 1
2886 raise
2887 if startv != 0 or endv is not None:
2888 page_results = page_results[startv:endv]
2889 yield from page_results
2890
2891 # A little optimization - if the current page is not "full", i.e. does
2892 # not contain page_size videos, then we can assume that this page
2893 # is the last one - there are no more ids on further pages -
2894 # so there is no need to query again.
2895 if len(page_results) + startv < self._pagesize:
2896 break
2897
2898 # If we got the whole page, but the next page is not interesting,
2899 # break out early as well
2900 if end == nextfirstid:
2901 break
2902
2903
2904 class InAdvancePagedList(PagedList):
2905 def __init__(self, pagefunc, pagecount, pagesize):
2906 PagedList.__init__(self, pagefunc, pagesize, True)
2907 self._pagecount = pagecount
2908
2909 def _getslice(self, start, end):
2910 start_page = start // self._pagesize
2911 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2912 skip_elems = start - start_page * self._pagesize
2913 only_more = None if end is None else end - start
2914 for pagenum in range(start_page, end_page):
2915 page_results = self.getpage(pagenum)
2916 if skip_elems:
2917 page_results = page_results[skip_elems:]
2918 skip_elems = None
2919 if only_more is not None:
2920 if len(page_results) < only_more:
2921 only_more -= len(page_results)
2922 else:
2923 yield from page_results[:only_more]
2924 break
2925 yield from page_results
2926
2927
2928 def uppercase_escape(s):
2929 unicode_escape = codecs.getdecoder('unicode_escape')
2930 return re.sub(
2931 r'\\U[0-9a-fA-F]{8}',
2932 lambda m: unicode_escape(m.group(0))[0],
2933 s)
2934
2935
2936 def lowercase_escape(s):
2937 unicode_escape = codecs.getdecoder('unicode_escape')
2938 return re.sub(
2939 r'\\u[0-9a-fA-F]{4}',
2940 lambda m: unicode_escape(m.group(0))[0],
2941 s)
2942
2943
2944 def escape_rfc3986(s):
2945 """Escape non-ASCII characters as suggested by RFC 3986"""
2946 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2947 s = s.encode('utf-8')
2948 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2949
2950
2951 def escape_url(url):
2952 """Escape URL as suggested by RFC 3986"""
2953 url_parsed = compat_urllib_parse_urlparse(url)
2954 return url_parsed._replace(
2955 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2956 path=escape_rfc3986(url_parsed.path),
2957 params=escape_rfc3986(url_parsed.params),
2958 query=escape_rfc3986(url_parsed.query),
2959 fragment=escape_rfc3986(url_parsed.fragment)
2960 ).geturl()
2961
2962
2963 def parse_qs(url):
2964 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2965
2966
2967 def read_batch_urls(batch_fd):
2968 def fixup(url):
2969 if not isinstance(url, compat_str):
2970 url = url.decode('utf-8', 'replace')
2971 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2972 for bom in BOM_UTF8:
2973 if url.startswith(bom):
2974 url = url[len(bom):]
2975 url = url.lstrip()
2976 if not url or url.startswith(('#', ';', ']')):
2977 return False
2978 # "#" cannot be stripped out since it is part of the URI
2979 # However, it can be safely stripped out if it follows a whitespace
2980 return re.split(r'\s#', url, 1)[0].rstrip()
2981
2982 with contextlib.closing(batch_fd) as fd:
2983 return [url for url in map(fixup, fd) if url]
2984
2985
2986 def urlencode_postdata(*args, **kargs):
2987 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2988
2989
2990 def update_url_query(url, query):
2991 if not query:
2992 return url
2993 parsed_url = compat_urlparse.urlparse(url)
2994 qs = compat_parse_qs(parsed_url.query)
2995 qs.update(query)
2996 return compat_urlparse.urlunparse(parsed_url._replace(
2997 query=compat_urllib_parse_urlencode(qs, True)))
2998
2999
3000 def update_Request(req, url=None, data=None, headers={}, query={}):
3001 req_headers = req.headers.copy()
3002 req_headers.update(headers)
3003 req_data = data or req.data
3004 req_url = update_url_query(url or req.get_full_url(), query)
3005 req_get_method = req.get_method()
3006 if req_get_method == 'HEAD':
3007 req_type = HEADRequest
3008 elif req_get_method == 'PUT':
3009 req_type = PUTRequest
3010 else:
3011 req_type = compat_urllib_request.Request
3012 new_req = req_type(
3013 req_url, data=req_data, headers=req_headers,
3014 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3015 if hasattr(req, 'timeout'):
3016 new_req.timeout = req.timeout
3017 return new_req
3018
3019
3020 def _multipart_encode_impl(data, boundary):
3021 content_type = 'multipart/form-data; boundary=%s' % boundary
3022
3023 out = b''
3024 for k, v in data.items():
3025 out += b'--' + boundary.encode('ascii') + b'\r\n'
3026 if isinstance(k, compat_str):
3027 k = k.encode('utf-8')
3028 if isinstance(v, compat_str):
3029 v = v.encode('utf-8')
3030 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3031 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3032 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3033 if boundary.encode('ascii') in content:
3034 raise ValueError('Boundary overlaps with data')
3035 out += content
3036
3037 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3038
3039 return out, content_type
3040
3041
3042 def multipart_encode(data, boundary=None):
3043 '''
3044 Encode a dict to RFC 7578-compliant form-data
3045
3046 data:
3047 A dict where keys and values can be either Unicode or bytes-like
3048 objects.
3049 boundary:
3050 If specified, it must be a Unicode object and is used as the boundary.
3051 Otherwise a random boundary is generated.
3052
3053 Reference: https://tools.ietf.org/html/rfc7578
3054 '''
3055 has_specified_boundary = boundary is not None
3056
3057 while True:
3058 if boundary is None:
3059 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3060
3061 try:
3062 out, content_type = _multipart_encode_impl(data, boundary)
3063 break
3064 except ValueError:
3065 if has_specified_boundary:
3066 raise
3067 boundary = None
3068
3069 return out, content_type
3070
3071
3072 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3073 if isinstance(key_or_keys, (list, tuple)):
3074 for key in key_or_keys:
3075 if key not in d or d[key] is None or skip_false_values and not d[key]:
3076 continue
3077 return d[key]
3078 return default
3079 return d.get(key_or_keys, default)
3080
3081
3082 def try_get(src, getter, expected_type=None):
3083 for get in variadic(getter):
3084 try:
3085 v = get(src)
3086 except (AttributeError, KeyError, TypeError, IndexError):
3087 pass
3088 else:
3089 if expected_type is None or isinstance(v, expected_type):
3090 return v
3091
3092
3093 def merge_dicts(*dicts):
3094 merged = {}
3095 for a_dict in dicts:
3096 for k, v in a_dict.items():
3097 if v is None:
3098 continue
3099 if (k not in merged
3100 or (isinstance(v, compat_str) and v
3101 and isinstance(merged[k], compat_str)
3102 and not merged[k])):
3103 merged[k] = v
3104 return merged
3105
3106
3107 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3108 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3109
3110
3111 US_RATINGS = {
3112 'G': 0,
3113 'PG': 10,
3114 'PG-13': 13,
3115 'R': 16,
3116 'NC': 18,
3117 }
3118
3119
3120 TV_PARENTAL_GUIDELINES = {
3121 'TV-Y': 0,
3122 'TV-Y7': 7,
3123 'TV-G': 0,
3124 'TV-PG': 0,
3125 'TV-14': 14,
3126 'TV-MA': 17,
3127 }
3128
3129
3130 def parse_age_limit(s):
3131 if type(s) == int:
3132 return s if 0 <= s <= 21 else None
3133 if not isinstance(s, compat_basestring):
3134 return None
3135 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3136 if m:
3137 return int(m.group('age'))
3138 s = s.upper()
3139 if s in US_RATINGS:
3140 return US_RATINGS[s]
3141 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3142 if m:
3143 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3144 return None
3145
3146
3147 def strip_jsonp(code):
3148 return re.sub(
3149 r'''(?sx)^
3150 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3151 (?:\s*&&\s*(?P=func_name))?
3152 \s*\(\s*(?P<callback_data>.*)\);?
3153 \s*?(?://[^\n]*)*$''',
3154 r'\g<callback_data>', code)
3155
3156
3157 def js_to_json(code, vars={}):
3158 # vars is a dict of var, val pairs to substitute
3159 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3160 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3161 INTEGER_TABLE = (
3162 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3163 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3164 )
3165
3166 def fix_kv(m):
3167 v = m.group(0)
3168 if v in ('true', 'false', 'null'):
3169 return v
3170 elif v in ('undefined', 'void 0'):
3171 return 'null'
3172 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3173 return ""
3174
3175 if v[0] in ("'", '"'):
3176 v = re.sub(r'(?s)\\.|"', lambda m: {
3177 '"': '\\"',
3178 "\\'": "'",
3179 '\\\n': '',
3180 '\\x': '\\u00',
3181 }.get(m.group(0), m.group(0)), v[1:-1])
3182 else:
3183 for regex, base in INTEGER_TABLE:
3184 im = re.match(regex, v)
3185 if im:
3186 i = int(im.group(1), base)
3187 return '"%d":' % i if v.endswith(':') else '%d' % i
3188
3189 if v in vars:
3190 return vars[v]
3191
3192 return '"%s"' % v
3193
3194 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3195
3196 return re.sub(r'''(?sx)
3197 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3198 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3199 {comment}|,(?={skip}[\]}}])|
3200 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3201 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3202 [0-9]+(?={skip}:)|
3203 !+
3204 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
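# Illustrative examples (not part of the original module):
#   js_to_json("{abc: true}")  == '{"abc": true}'
#   js_to_json("{'x': 0x10}")  == '{"x": 16}'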
3205
3206
3207 def qualities(quality_ids):
3208 """ Get a numeric quality value out of a list of possible values """
3209 def q(qid):
3210 try:
3211 return quality_ids.index(qid)
3212 except ValueError:
3213 return -1
3214 return q
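# Illustrative usage (not part of the original module):
#   q = qualities(['240p', '360p', '720p'])
#   q('720p')   == 2
#   q('1080p')  == -1  (unknown values sort lowest)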
3215
3216
3217 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3218
3219
3220 DEFAULT_OUTTMPL = {
3221 'default': '%(title)s [%(id)s].%(ext)s',
3222 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3223 }
3224 OUTTMPL_TYPES = {
3225 'chapter': None,
3226 'subtitle': None,
3227 'thumbnail': None,
3228 'description': 'description',
3229 'annotation': 'annotations.xml',
3230 'infojson': 'info.json',
3231 'link': None,
3232 'pl_video': None,
3233 'pl_thumbnail': None,
3234 'pl_description': 'description',
3235 'pl_infojson': 'info.json',
3236 }
3237
3238 # As of [1] format syntax is:
3239 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3240 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3241 STR_FORMAT_RE_TMPL = r'''(?x)
3242 (?<!%)(?P<prefix>(?:%%)*)
3243 %
3244 (?P<has_key>\((?P<key>{0})\))?
3245 (?P<format>
3246 (?P<conversion>[#0\-+ ]+)?
3247 (?P<min_width>\d+)?
3248 (?P<precision>\.\d+)?
3249 (?P<len_mod>[hlL])? # unused in python
3250 {1} # conversion type
3251 )
3252 '''
3253
3254
3255 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3256
3257
3258 def limit_length(s, length):
3259 """ Add ellipses to overly long strings """
3260 if s is None:
3261 return None
3262 ELLIPSES = '...'
3263 if len(s) > length:
3264 return s[:length - len(ELLIPSES)] + ELLIPSES
3265 return s
3266
3267
3268 def version_tuple(v):
3269 return tuple(int(e) for e in re.split(r'[-.]', v))
3270
3271
3272 def is_outdated_version(version, limit, assume_new=True):
3273 if not version:
3274 return not assume_new
3275 try:
3276 return version_tuple(version) < version_tuple(limit)
3277 except ValueError:
3278 return not assume_new
3279
3280
3281 def ytdl_is_updateable():
3282 """ Returns if yt-dlp can be updated with -U """
3283
3284 from .update import is_non_updateable
3285
3286 return not is_non_updateable()
3287
3288
3289 def args_to_str(args):
3290 # Get a short string representation for a subprocess command
3291 return ' '.join(compat_shlex_quote(a) for a in args)
3292
3293
3294 def error_to_compat_str(err):
3295 err_str = str(err)
3296 # On Python 2, the error byte string must be decoded with the proper
3297 # encoding rather than ASCII
3298 if sys.version_info[0] < 3:
3299 err_str = err_str.decode(preferredencoding())
3300 return err_str
3301
3302
3303 def mimetype2ext(mt):
3304 if mt is None:
3305 return None
3306
3307 mt, _, params = mt.partition(';')
3308 mt = mt.strip()
3309
3310 FULL_MAP = {
3311 'audio/mp4': 'm4a',
3312 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. .mp3 is used here as
3313 # it's the most popular one
3314 'audio/mpeg': 'mp3',
3315 'audio/x-wav': 'wav',
3316 'audio/wav': 'wav',
3317 'audio/wave': 'wav',
3318 }
3319
3320 ext = FULL_MAP.get(mt)
3321 if ext is not None:
3322 return ext
3323
3324 SUBTYPE_MAP = {
3325 '3gpp': '3gp',
3326 'smptett+xml': 'tt',
3327 'ttaf+xml': 'dfxp',
3328 'ttml+xml': 'ttml',
3329 'x-flv': 'flv',
3330 'x-mp4-fragmented': 'mp4',
3331 'x-ms-sami': 'sami',
3332 'x-ms-wmv': 'wmv',
3333 'mpegurl': 'm3u8',
3334 'x-mpegurl': 'm3u8',
3335 'vnd.apple.mpegurl': 'm3u8',
3336 'dash+xml': 'mpd',
3337 'f4m+xml': 'f4m',
3338 'hds+xml': 'f4m',
3339 'vnd.ms-sstr+xml': 'ism',
3340 'quicktime': 'mov',
3341 'mp2t': 'ts',
3342 'x-wav': 'wav',
3343 'filmstrip+json': 'fs',
3344 'svg+xml': 'svg',
3345 }
3346
3347 _, _, subtype = mt.rpartition('/')
3348 ext = SUBTYPE_MAP.get(subtype.lower())
3349 if ext is not None:
3350 return ext
3351
3352 SUFFIX_MAP = {
3353 'json': 'json',
3354 'xml': 'xml',
3355 'zip': 'zip',
3356 'gzip': 'gz',
3357 }
3358
3359 _, _, suffix = subtype.partition('+')
3360 ext = SUFFIX_MAP.get(suffix)
3361 if ext is not None:
3362 return ext
3363
3364 return subtype.replace('+', '.')
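# Illustrative examples (not part of the original module):
#   mimetype2ext('audio/mp4')              == 'm4a'
#   mimetype2ext('application/x-mpegURL')  == 'm3u8'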
3365
3366
3367 def ext2mimetype(ext_or_url):
3368 if not ext_or_url:
3369 return None
3370 if '.' not in ext_or_url:
3371 ext_or_url = f'file.{ext_or_url}'
3372 return mimetypes.guess_type(ext_or_url)[0]
3373
3374
3375 def parse_codecs(codecs_str):
3376 # http://tools.ietf.org/html/rfc6381
3377 if not codecs_str:
3378 return {}
3379 split_codecs = list(filter(None, map(
3380 str.strip, codecs_str.strip().strip(',').split(','))))
3381 vcodec, acodec, tcodec, hdr = None, None, None, None
3382 for full_codec in split_codecs:
3383 parts = full_codec.split('.')
3384 codec = parts[0].replace('0', '')
3385 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3386 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3387 if not vcodec:
3388 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3389 if codec in ('dvh1', 'dvhe'):
3390 hdr = 'DV'
3391 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3392 hdr = 'HDR10'
3393 elif full_codec.replace('0', '').startswith('vp9.2'):
3394 hdr = 'HDR10'
3395 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3396 if not acodec:
3397 acodec = full_codec
3398 elif codec in ('stpp', 'wvtt',):
3399 if not tcodec:
3400 tcodec = full_codec
3401 else:
3402 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3403 if vcodec or acodec or tcodec:
3404 return {
3405 'vcodec': vcodec or 'none',
3406 'acodec': acodec or 'none',
3407 'dynamic_range': hdr,
3408 **({'tcodec': tcodec} if tcodec is not None else {}),
3409 }
3410 elif len(split_codecs) == 2:
3411 return {
3412 'vcodec': split_codecs[0],
3413 'acodec': split_codecs[1],
3414 }
3415 return {}
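# Illustrative example (not part of the original module):
#   parse_codecs('avc1.64001f, mp4a.40.2')  ==
#       {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}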
3416
3417
3418 def urlhandle_detect_ext(url_handle):
3419 getheader = url_handle.headers.get
3420
3421 cd = getheader('Content-Disposition')
3422 if cd:
3423 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3424 if m:
3425 e = determine_ext(m.group('filename'), default_ext=None)
3426 if e:
3427 return e
3428
3429 return mimetype2ext(getheader('Content-Type'))
3430
3431
3432 def encode_data_uri(data, mime_type):
3433 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3434
3435
3436 def age_restricted(content_limit, age_limit):
3437 """ Returns True iff the content should be blocked """
3438
3439 if age_limit is None: # No limit set
3440 return False
3441 if content_limit is None:
3442 return False # Content available for everyone
3443 return age_limit < content_limit
3444
3445
3446 def is_html(first_bytes):
3447 """ Detect whether a file contains HTML by examining its first bytes. """
3448
3449 BOMS = [
3450 (b'\xef\xbb\xbf', 'utf-8'),
3451 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3452 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3453 (b'\xff\xfe', 'utf-16-le'),
3454 (b'\xfe\xff', 'utf-16-be'),
3455 ]
3456 for bom, enc in BOMS:
3457 if first_bytes.startswith(bom):
3458 s = first_bytes[len(bom):].decode(enc, 'replace')
3459 break
3460 else:
3461 s = first_bytes.decode('utf-8', 'replace')
3462
3463 return re.match(r'^\s*<', s)
3464
3465
3466 def determine_protocol(info_dict):
3467 protocol = info_dict.get('protocol')
3468 if protocol is not None:
3469 return protocol
3470
3471 url = sanitize_url(info_dict['url'])
3472 if url.startswith('rtmp'):
3473 return 'rtmp'
3474 elif url.startswith('mms'):
3475 return 'mms'
3476 elif url.startswith('rtsp'):
3477 return 'rtsp'
3478
3479 ext = determine_ext(url)
3480 if ext == 'm3u8':
3481 return 'm3u8'
3482 elif ext == 'f4m':
3483 return 'f4m'
3484
3485 return compat_urllib_parse_urlparse(url).scheme
3486
3487
3488 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3489 """ Render a list of rows, each as a list of values.
3490 Text after a \t will be right-aligned """
3491 def width(string):
3492 return len(remove_terminal_sequences(string).replace('\t', ''))
3493
3494 def get_max_lens(table):
3495 return [max(width(str(v)) for v in col) for col in zip(*table)]
3496
3497 def filter_using_list(row, filterArray):
3498 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3499
3500 max_lens = get_max_lens(data) if hide_empty else []
3501 header_row = filter_using_list(header_row, max_lens)
3502 data = [filter_using_list(row, max_lens) for row in data]
3503
3504 table = [header_row] + data
3505 max_lens = get_max_lens(table)
3506 extra_gap += 1
3507 if delim:
3508 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3509 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3510 for row in table:
3511 for pos, text in enumerate(map(str, row)):
3512 if '\t' in text:
3513 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3514 else:
3515 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3516 ret = '\n'.join(''.join(row).rstrip() for row in table)
3517 return ret
3518
3519
3520 def _match_one(filter_part, dct, incomplete):
3521 # TODO: Generalize code with YoutubeDL._build_format_filter
3522 STRING_OPERATORS = {
3523 '*=': operator.contains,
3524 '^=': lambda attr, value: attr.startswith(value),
3525 '$=': lambda attr, value: attr.endswith(value),
3526 '~=': lambda attr, value: re.search(value, attr),
3527 }
3528 COMPARISON_OPERATORS = {
3529 **STRING_OPERATORS,
3530 '<=': operator.le, # "<=" must be defined above "<"
3531 '<': operator.lt,
3532 '>=': operator.ge,
3533 '>': operator.gt,
3534 '=': operator.eq,
3535 }
3536
3537 operator_rex = re.compile(r'''(?x)\s*
3538 (?P<key>[a-z_]+)
3539 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3540 (?:
3541 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3542 (?P<strval>.+?)
3543 )
3544 \s*$
3545 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3546 m = operator_rex.search(filter_part)
3547 if m:
3548 m = m.groupdict()
3549 unnegated_op = COMPARISON_OPERATORS[m['op']]
3550 if m['negation']:
3551 op = lambda attr, value: not unnegated_op(attr, value)
3552 else:
3553 op = unnegated_op
3554 comparison_value = m['quotedstrval'] or m['strval']
3555 if m['quote']:
3556 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3557 actual_value = dct.get(m['key'])
3558 numeric_comparison = None
3559 if isinstance(actual_value, compat_numeric_types):
3560 # If the original field is a string and the matching comparison value is
3561 # a number, we should respect the origin of the original field
3562 # and process the comparison value as a string (see
3563 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3564 try:
3565 numeric_comparison = int(comparison_value)
3566 except ValueError:
3567 numeric_comparison = parse_filesize(comparison_value)
3568 if numeric_comparison is None:
3569 numeric_comparison = parse_filesize(f'{comparison_value}B')
3570 if numeric_comparison is None:
3571 numeric_comparison = parse_duration(comparison_value)
3572 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3573 raise ValueError('Operator %s only supports string values!' % m['op'])
3574 if actual_value is None:
3575 return incomplete or m['none_inclusive']
3576 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3577
3578 UNARY_OPERATORS = {
3579 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3580 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3581 }
3582 operator_rex = re.compile(r'''(?x)\s*
3583 (?P<op>%s)\s*(?P<key>[a-z_]+)
3584 \s*$
3585 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3586 m = operator_rex.search(filter_part)
3587 if m:
3588 op = UNARY_OPERATORS[m.group('op')]
3589 actual_value = dct.get(m.group('key'))
3590 if incomplete and actual_value is None:
3591 return True
3592 return op(actual_value)
3593
3594 raise ValueError('Invalid filter part %r' % filter_part)
3595
3596
3597 def match_str(filter_str, dct, incomplete=False):
3598 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3599 When incomplete, all conditions passes on missing fields
3600 """
3601 return all(
3602 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3603 for filter_part in re.split(r'(?<!\\)&', filter_str))
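# Filter-syntax sketch (hypothetical dicts, for illustration only):
#   match_str('duration > 30 & description', {'duration': 60, 'description': 'x'})  # True
#   match_str('like_count > 100 & dislike_count <? 50', {'like_count': 190})        # True
# The trailing '?' makes a comparison pass when the field is missing.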
3604
3605
3606 def match_filter_func(filter_str):
3607 if filter_str is None:
3608 return None
3609
3610 def _match_func(info_dict, *args, **kwargs):
3611 if match_str(filter_str, info_dict, *args, **kwargs):
3612 return None
3613 else:
3614 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3615 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3616 return _match_func
3617
3618
3619 def parse_dfxp_time_expr(time_expr):
3620 if not time_expr:
3621 return
3622
3623 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3624 if mobj:
3625 return float(mobj.group('time_offset'))
3626
3627 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3628 if mobj:
3629 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
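# The two accepted forms, with illustrative values:
#   parse_dfxp_time_expr('5.1s')        # => 5.1
#   parse_dfxp_time_expr('00:01:02.5')  # => 62.5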
3630
3631
3632 def srt_subtitles_timecode(seconds):
3633 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3634
3635
3636 def ass_subtitles_timecode(seconds):
3637 time = timetuple_from_msec(seconds * 1000)
3638 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
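# Illustrative outputs (assuming timetuple_from_msec as defined earlier in this module):
#   srt_subtitles_timecode(3.5)  # => '00:00:03,500'
#   ass_subtitles_timecode(3.5)  # => '0:00:03.50'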
3639
3640
3641 def dfxp2srt(dfxp_data):
3642 '''
3643 @param dfxp_data A bytes-like object containing DFXP data
3644 @returns A unicode object containing converted SRT data
3645 '''
3646 LEGACY_NAMESPACES = (
3647 (b'http://www.w3.org/ns/ttml', [
3648 b'http://www.w3.org/2004/11/ttaf1',
3649 b'http://www.w3.org/2006/04/ttaf1',
3650 b'http://www.w3.org/2006/10/ttaf1',
3651 ]),
3652 (b'http://www.w3.org/ns/ttml#styling', [
3653 b'http://www.w3.org/ns/ttml#style',
3654 ]),
3655 )
3656
3657 SUPPORTED_STYLING = [
3658 'color',
3659 'fontFamily',
3660 'fontSize',
3661 'fontStyle',
3662 'fontWeight',
3663 'textDecoration'
3664 ]
3665
3666 _x = functools.partial(xpath_with_ns, ns_map={
3667 'xml': 'http://www.w3.org/XML/1998/namespace',
3668 'ttml': 'http://www.w3.org/ns/ttml',
3669 'tts': 'http://www.w3.org/ns/ttml#styling',
3670 })
3671
3672 styles = {}
3673 default_style = {}
3674
3675 class TTMLPElementParser(object):
3676 _out = ''
3677 _unclosed_elements = []
3678 _applied_styles = []
3679
3680 def start(self, tag, attrib):
3681 if tag in (_x('ttml:br'), 'br'):
3682 self._out += '\n'
3683 else:
3684 unclosed_elements = []
3685 style = {}
3686 element_style_id = attrib.get('style')
3687 if default_style:
3688 style.update(default_style)
3689 if element_style_id:
3690 style.update(styles.get(element_style_id, {}))
3691 for prop in SUPPORTED_STYLING:
3692 prop_val = attrib.get(_x('tts:' + prop))
3693 if prop_val:
3694 style[prop] = prop_val
3695 if style:
3696 font = ''
3697 for k, v in sorted(style.items()):
3698 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3699 continue
3700 if k == 'color':
3701 font += ' color="%s"' % v
3702 elif k == 'fontSize':
3703 font += ' size="%s"' % v
3704 elif k == 'fontFamily':
3705 font += ' face="%s"' % v
3706 elif k == 'fontWeight' and v == 'bold':
3707 self._out += '<b>'
3708 unclosed_elements.append('b')
3709 elif k == 'fontStyle' and v == 'italic':
3710 self._out += '<i>'
3711 unclosed_elements.append('i')
3712 elif k == 'textDecoration' and v == 'underline':
3713 self._out += '<u>'
3714 unclosed_elements.append('u')
3715 if font:
3716 self._out += '<font' + font + '>'
3717 unclosed_elements.append('font')
3718 applied_style = {}
3719 if self._applied_styles:
3720 applied_style.update(self._applied_styles[-1])
3721 applied_style.update(style)
3722 self._applied_styles.append(applied_style)
3723 self._unclosed_elements.append(unclosed_elements)
3724
3725 def end(self, tag):
3726 if tag not in (_x('ttml:br'), 'br'):
3727 unclosed_elements = self._unclosed_elements.pop()
3728 for element in reversed(unclosed_elements):
3729 self._out += '</%s>' % element
3730 if unclosed_elements and self._applied_styles:
3731 self._applied_styles.pop()
3732
3733 def data(self, data):
3734 self._out += data
3735
3736 def close(self):
3737 return self._out.strip()
3738
3739 def parse_node(node):
3740 target = TTMLPElementParser()
3741 parser = xml.etree.ElementTree.XMLParser(target=target)
3742 parser.feed(xml.etree.ElementTree.tostring(node))
3743 return parser.close()
3744
3745 for k, v in LEGACY_NAMESPACES:
3746 for ns in v:
3747 dfxp_data = dfxp_data.replace(ns, k)
3748
3749 dfxp = compat_etree_fromstring(dfxp_data)
3750 out = []
3751 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3752
3753 if not paras:
3754 raise ValueError('Invalid dfxp/TTML subtitle')
3755
3756 repeat = False
3757 while True:
3758 for style in dfxp.findall(_x('.//ttml:style')):
3759 style_id = style.get('id') or style.get(_x('xml:id'))
3760 if not style_id:
3761 continue
3762 parent_style_id = style.get('style')
3763 if parent_style_id:
3764 if parent_style_id not in styles:
3765 repeat = True
3766 continue
3767 styles[style_id] = styles[parent_style_id].copy()
3768 for prop in SUPPORTED_STYLING:
3769 prop_val = style.get(_x('tts:' + prop))
3770 if prop_val:
3771 styles.setdefault(style_id, {})[prop] = prop_val
3772 if repeat:
3773 repeat = False
3774 else:
3775 break
3776
3777 for p in ('body', 'div'):
3778 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3779 if ele is None:
3780 continue
3781 style = styles.get(ele.get('style'))
3782 if not style:
3783 continue
3784 default_style.update(style)
3785
3786 for para, index in zip(paras, itertools.count(1)):
3787 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3788 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3789 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3790 if begin_time is None:
3791 continue
3792 if not end_time:
3793 if not dur:
3794 continue
3795 end_time = begin_time + dur
3796 out.append('%d\n%s --> %s\n%s\n\n' % (
3797 index,
3798 srt_subtitles_timecode(begin_time),
3799 srt_subtitles_timecode(end_time),
3800 parse_node(para)))
3801
3802 return ''.join(out)
3803
3804
3805 def cli_option(params, command_option, param):
3806 param = params.get(param)
3807 if param:
3808 param = compat_str(param)
3809 return [command_option, param] if param is not None else []
3810
3811
3812 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3813 param = params.get(param)
3814 if param is None:
3815 return []
3816 assert isinstance(param, bool)
3817 if separator:
3818 return [command_option + separator + (true_value if param else false_value)]
3819 return [command_option, true_value if param else false_value]
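# Usage sketch (hypothetical params); the separator form suits tools that expect a
# single '--opt=value' argument:
#   cli_bool_option({'check': True}, '--check', 'check')                 # => ['--check', 'true']
#   cli_bool_option({'check': True}, '--check', 'check', separator='=')  # => ['--check=true']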
3820
3821
3822 def cli_valueless_option(params, command_option, param, expected_value=True):
3823 param = params.get(param)
3824 return [command_option] if param == expected_value else []
3825
3826
3827 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3828 if isinstance(argdict, (list, tuple)): # for backward compatibility
3829 if use_compat:
3830 return argdict
3831 else:
3832 argdict = None
3833 if argdict is None:
3834 return default
3835 assert isinstance(argdict, dict)
3836
3837 assert isinstance(keys, (list, tuple))
3838 for key_list in keys:
3839 arg_list = list(filter(
3840 lambda x: x is not None,
3841 [argdict.get(key.lower()) for key in variadic(key_list)]))
3842 if arg_list:
3843 return [arg for args in arg_list for arg in args]
3844 return default
3845
3846
3847 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3848 main_key, exe = main_key.lower(), exe.lower()
3849 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3850 keys = [f'{root_key}{k}' for k in (keys or [''])]
3851 if root_key in keys:
3852 if main_key != exe:
3853 keys.append((main_key, exe))
3854 keys.append('default')
3855 else:
3856 use_compat = False
3857 return cli_configuration_args(argdict, keys, default, use_compat)
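# Lookup-order sketch (hypothetical keys): with main_key='downloader' and exe='aria2c',
# root_key becomes 'downloader+aria2c' and the keys tried against argdict are
#   ['downloader+aria2c', ('downloader', 'aria2c'), 'default']
# where the first entry yielding any args wins (see cli_configuration_args above).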
3858
3859
3860 class ISO639Utils(object):
3861 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3862 _lang_map = {
3863 'aa': 'aar',
3864 'ab': 'abk',
3865 'ae': 'ave',
3866 'af': 'afr',
3867 'ak': 'aka',
3868 'am': 'amh',
3869 'an': 'arg',
3870 'ar': 'ara',
3871 'as': 'asm',
3872 'av': 'ava',
3873 'ay': 'aym',
3874 'az': 'aze',
3875 'ba': 'bak',
3876 'be': 'bel',
3877 'bg': 'bul',
3878 'bh': 'bih',
3879 'bi': 'bis',
3880 'bm': 'bam',
3881 'bn': 'ben',
3882 'bo': 'bod',
3883 'br': 'bre',
3884 'bs': 'bos',
3885 'ca': 'cat',
3886 'ce': 'che',
3887 'ch': 'cha',
3888 'co': 'cos',
3889 'cr': 'cre',
3890 'cs': 'ces',
3891 'cu': 'chu',
3892 'cv': 'chv',
3893 'cy': 'cym',
3894 'da': 'dan',
3895 'de': 'deu',
3896 'dv': 'div',
3897 'dz': 'dzo',
3898 'ee': 'ewe',
3899 'el': 'ell',
3900 'en': 'eng',
3901 'eo': 'epo',
3902 'es': 'spa',
3903 'et': 'est',
3904 'eu': 'eus',
3905 'fa': 'fas',
3906 'ff': 'ful',
3907 'fi': 'fin',
3908 'fj': 'fij',
3909 'fo': 'fao',
3910 'fr': 'fra',
3911 'fy': 'fry',
3912 'ga': 'gle',
3913 'gd': 'gla',
3914 'gl': 'glg',
3915 'gn': 'grn',
3916 'gu': 'guj',
3917 'gv': 'glv',
3918 'ha': 'hau',
3919 'he': 'heb',
3920 'iw': 'heb', # Replaced by he in 1989 revision
3921 'hi': 'hin',
3922 'ho': 'hmo',
3923 'hr': 'hrv',
3924 'ht': 'hat',
3925 'hu': 'hun',
3926 'hy': 'hye',
3927 'hz': 'her',
3928 'ia': 'ina',
3929 'id': 'ind',
3930 'in': 'ind', # Replaced by id in 1989 revision
3931 'ie': 'ile',
3932 'ig': 'ibo',
3933 'ii': 'iii',
3934 'ik': 'ipk',
3935 'io': 'ido',
3936 'is': 'isl',
3937 'it': 'ita',
3938 'iu': 'iku',
3939 'ja': 'jpn',
3940 'jv': 'jav',
3941 'ka': 'kat',
3942 'kg': 'kon',
3943 'ki': 'kik',
3944 'kj': 'kua',
3945 'kk': 'kaz',
3946 'kl': 'kal',
3947 'km': 'khm',
3948 'kn': 'kan',
3949 'ko': 'kor',
3950 'kr': 'kau',
3951 'ks': 'kas',
3952 'ku': 'kur',
3953 'kv': 'kom',
3954 'kw': 'cor',
3955 'ky': 'kir',
3956 'la': 'lat',
3957 'lb': 'ltz',
3958 'lg': 'lug',
3959 'li': 'lim',
3960 'ln': 'lin',
3961 'lo': 'lao',
3962 'lt': 'lit',
3963 'lu': 'lub',
3964 'lv': 'lav',
3965 'mg': 'mlg',
3966 'mh': 'mah',
3967 'mi': 'mri',
3968 'mk': 'mkd',
3969 'ml': 'mal',
3970 'mn': 'mon',
3971 'mr': 'mar',
3972 'ms': 'msa',
3973 'mt': 'mlt',
3974 'my': 'mya',
3975 'na': 'nau',
3976 'nb': 'nob',
3977 'nd': 'nde',
3978 'ne': 'nep',
3979 'ng': 'ndo',
3980 'nl': 'nld',
3981 'nn': 'nno',
3982 'no': 'nor',
3983 'nr': 'nbl',
3984 'nv': 'nav',
3985 'ny': 'nya',
3986 'oc': 'oci',
3987 'oj': 'oji',
3988 'om': 'orm',
3989 'or': 'ori',
3990 'os': 'oss',
3991 'pa': 'pan',
3992 'pi': 'pli',
3993 'pl': 'pol',
3994 'ps': 'pus',
3995 'pt': 'por',
3996 'qu': 'que',
3997 'rm': 'roh',
3998 'rn': 'run',
3999 'ro': 'ron',
4000 'ru': 'rus',
4001 'rw': 'kin',
4002 'sa': 'san',
4003 'sc': 'srd',
4004 'sd': 'snd',
4005 'se': 'sme',
4006 'sg': 'sag',
4007 'si': 'sin',
4008 'sk': 'slk',
4009 'sl': 'slv',
4010 'sm': 'smo',
4011 'sn': 'sna',
4012 'so': 'som',
4013 'sq': 'sqi',
4014 'sr': 'srp',
4015 'ss': 'ssw',
4016 'st': 'sot',
4017 'su': 'sun',
4018 'sv': 'swe',
4019 'sw': 'swa',
4020 'ta': 'tam',
4021 'te': 'tel',
4022 'tg': 'tgk',
4023 'th': 'tha',
4024 'ti': 'tir',
4025 'tk': 'tuk',
4026 'tl': 'tgl',
4027 'tn': 'tsn',
4028 'to': 'ton',
4029 'tr': 'tur',
4030 'ts': 'tso',
4031 'tt': 'tat',
4032 'tw': 'twi',
4033 'ty': 'tah',
4034 'ug': 'uig',
4035 'uk': 'ukr',
4036 'ur': 'urd',
4037 'uz': 'uzb',
4038 've': 'ven',
4039 'vi': 'vie',
4040 'vo': 'vol',
4041 'wa': 'wln',
4042 'wo': 'wol',
4043 'xh': 'xho',
4044 'yi': 'yid',
4045 'ji': 'yid', # Replaced by yi in 1989 revision
4046 'yo': 'yor',
4047 'za': 'zha',
4048 'zh': 'zho',
4049 'zu': 'zul',
4050 }
4051
4052 @classmethod
4053 def short2long(cls, code):
4054 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4055 return cls._lang_map.get(code[:2])
4056
4057 @classmethod
4058 def long2short(cls, code):
4059 """Convert language code from ISO 639-2/T to ISO 639-1"""
4060 for short_name, long_name in cls._lang_map.items():
4061 if long_name == code:
4062 return short_name
4063
4064
4065 class ISO3166Utils(object):
4066 # From http://data.okfn.org/data/core/country-list
4067 _country_map = {
4068 'AF': 'Afghanistan',
4069 'AX': 'Åland Islands',
4070 'AL': 'Albania',
4071 'DZ': 'Algeria',
4072 'AS': 'American Samoa',
4073 'AD': 'Andorra',
4074 'AO': 'Angola',
4075 'AI': 'Anguilla',
4076 'AQ': 'Antarctica',
4077 'AG': 'Antigua and Barbuda',
4078 'AR': 'Argentina',
4079 'AM': 'Armenia',
4080 'AW': 'Aruba',
4081 'AU': 'Australia',
4082 'AT': 'Austria',
4083 'AZ': 'Azerbaijan',
4084 'BS': 'Bahamas',
4085 'BH': 'Bahrain',
4086 'BD': 'Bangladesh',
4087 'BB': 'Barbados',
4088 'BY': 'Belarus',
4089 'BE': 'Belgium',
4090 'BZ': 'Belize',
4091 'BJ': 'Benin',
4092 'BM': 'Bermuda',
4093 'BT': 'Bhutan',
4094 'BO': 'Bolivia, Plurinational State of',
4095 'BQ': 'Bonaire, Sint Eustatius and Saba',
4096 'BA': 'Bosnia and Herzegovina',
4097 'BW': 'Botswana',
4098 'BV': 'Bouvet Island',
4099 'BR': 'Brazil',
4100 'IO': 'British Indian Ocean Territory',
4101 'BN': 'Brunei Darussalam',
4102 'BG': 'Bulgaria',
4103 'BF': 'Burkina Faso',
4104 'BI': 'Burundi',
4105 'KH': 'Cambodia',
4106 'CM': 'Cameroon',
4107 'CA': 'Canada',
4108 'CV': 'Cape Verde',
4109 'KY': 'Cayman Islands',
4110 'CF': 'Central African Republic',
4111 'TD': 'Chad',
4112 'CL': 'Chile',
4113 'CN': 'China',
4114 'CX': 'Christmas Island',
4115 'CC': 'Cocos (Keeling) Islands',
4116 'CO': 'Colombia',
4117 'KM': 'Comoros',
4118 'CG': 'Congo',
4119 'CD': 'Congo, the Democratic Republic of the',
4120 'CK': 'Cook Islands',
4121 'CR': 'Costa Rica',
4122 'CI': 'Côte d\'Ivoire',
4123 'HR': 'Croatia',
4124 'CU': 'Cuba',
4125 'CW': 'Curaçao',
4126 'CY': 'Cyprus',
4127 'CZ': 'Czech Republic',
4128 'DK': 'Denmark',
4129 'DJ': 'Djibouti',
4130 'DM': 'Dominica',
4131 'DO': 'Dominican Republic',
4132 'EC': 'Ecuador',
4133 'EG': 'Egypt',
4134 'SV': 'El Salvador',
4135 'GQ': 'Equatorial Guinea',
4136 'ER': 'Eritrea',
4137 'EE': 'Estonia',
4138 'ET': 'Ethiopia',
4139 'FK': 'Falkland Islands (Malvinas)',
4140 'FO': 'Faroe Islands',
4141 'FJ': 'Fiji',
4142 'FI': 'Finland',
4143 'FR': 'France',
4144 'GF': 'French Guiana',
4145 'PF': 'French Polynesia',
4146 'TF': 'French Southern Territories',
4147 'GA': 'Gabon',
4148 'GM': 'Gambia',
4149 'GE': 'Georgia',
4150 'DE': 'Germany',
4151 'GH': 'Ghana',
4152 'GI': 'Gibraltar',
4153 'GR': 'Greece',
4154 'GL': 'Greenland',
4155 'GD': 'Grenada',
4156 'GP': 'Guadeloupe',
4157 'GU': 'Guam',
4158 'GT': 'Guatemala',
4159 'GG': 'Guernsey',
4160 'GN': 'Guinea',
4161 'GW': 'Guinea-Bissau',
4162 'GY': 'Guyana',
4163 'HT': 'Haiti',
4164 'HM': 'Heard Island and McDonald Islands',
4165 'VA': 'Holy See (Vatican City State)',
4166 'HN': 'Honduras',
4167 'HK': 'Hong Kong',
4168 'HU': 'Hungary',
4169 'IS': 'Iceland',
4170 'IN': 'India',
4171 'ID': 'Indonesia',
4172 'IR': 'Iran, Islamic Republic of',
4173 'IQ': 'Iraq',
4174 'IE': 'Ireland',
4175 'IM': 'Isle of Man',
4176 'IL': 'Israel',
4177 'IT': 'Italy',
4178 'JM': 'Jamaica',
4179 'JP': 'Japan',
4180 'JE': 'Jersey',
4181 'JO': 'Jordan',
4182 'KZ': 'Kazakhstan',
4183 'KE': 'Kenya',
4184 'KI': 'Kiribati',
4185 'KP': 'Korea, Democratic People\'s Republic of',
4186 'KR': 'Korea, Republic of',
4187 'KW': 'Kuwait',
4188 'KG': 'Kyrgyzstan',
4189 'LA': 'Lao People\'s Democratic Republic',
4190 'LV': 'Latvia',
4191 'LB': 'Lebanon',
4192 'LS': 'Lesotho',
4193 'LR': 'Liberia',
4194 'LY': 'Libya',
4195 'LI': 'Liechtenstein',
4196 'LT': 'Lithuania',
4197 'LU': 'Luxembourg',
4198 'MO': 'Macao',
4199 'MK': 'Macedonia, the Former Yugoslav Republic of',
4200 'MG': 'Madagascar',
4201 'MW': 'Malawi',
4202 'MY': 'Malaysia',
4203 'MV': 'Maldives',
4204 'ML': 'Mali',
4205 'MT': 'Malta',
4206 'MH': 'Marshall Islands',
4207 'MQ': 'Martinique',
4208 'MR': 'Mauritania',
4209 'MU': 'Mauritius',
4210 'YT': 'Mayotte',
4211 'MX': 'Mexico',
4212 'FM': 'Micronesia, Federated States of',
4213 'MD': 'Moldova, Republic of',
4214 'MC': 'Monaco',
4215 'MN': 'Mongolia',
4216 'ME': 'Montenegro',
4217 'MS': 'Montserrat',
4218 'MA': 'Morocco',
4219 'MZ': 'Mozambique',
4220 'MM': 'Myanmar',
4221 'NA': 'Namibia',
4222 'NR': 'Nauru',
4223 'NP': 'Nepal',
4224 'NL': 'Netherlands',
4225 'NC': 'New Caledonia',
4226 'NZ': 'New Zealand',
4227 'NI': 'Nicaragua',
4228 'NE': 'Niger',
4229 'NG': 'Nigeria',
4230 'NU': 'Niue',
4231 'NF': 'Norfolk Island',
4232 'MP': 'Northern Mariana Islands',
4233 'NO': 'Norway',
4234 'OM': 'Oman',
4235 'PK': 'Pakistan',
4236 'PW': 'Palau',
4237 'PS': 'Palestine, State of',
4238 'PA': 'Panama',
4239 'PG': 'Papua New Guinea',
4240 'PY': 'Paraguay',
4241 'PE': 'Peru',
4242 'PH': 'Philippines',
4243 'PN': 'Pitcairn',
4244 'PL': 'Poland',
4245 'PT': 'Portugal',
4246 'PR': 'Puerto Rico',
4247 'QA': 'Qatar',
4248 'RE': 'Réunion',
4249 'RO': 'Romania',
4250 'RU': 'Russian Federation',
4251 'RW': 'Rwanda',
4252 'BL': 'Saint Barthélemy',
4253 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4254 'KN': 'Saint Kitts and Nevis',
4255 'LC': 'Saint Lucia',
4256 'MF': 'Saint Martin (French part)',
4257 'PM': 'Saint Pierre and Miquelon',
4258 'VC': 'Saint Vincent and the Grenadines',
4259 'WS': 'Samoa',
4260 'SM': 'San Marino',
4261 'ST': 'Sao Tome and Principe',
4262 'SA': 'Saudi Arabia',
4263 'SN': 'Senegal',
4264 'RS': 'Serbia',
4265 'SC': 'Seychelles',
4266 'SL': 'Sierra Leone',
4267 'SG': 'Singapore',
4268 'SX': 'Sint Maarten (Dutch part)',
4269 'SK': 'Slovakia',
4270 'SI': 'Slovenia',
4271 'SB': 'Solomon Islands',
4272 'SO': 'Somalia',
4273 'ZA': 'South Africa',
4274 'GS': 'South Georgia and the South Sandwich Islands',
4275 'SS': 'South Sudan',
4276 'ES': 'Spain',
4277 'LK': 'Sri Lanka',
4278 'SD': 'Sudan',
4279 'SR': 'Suriname',
4280 'SJ': 'Svalbard and Jan Mayen',
4281 'SZ': 'Swaziland',
4282 'SE': 'Sweden',
4283 'CH': 'Switzerland',
4284 'SY': 'Syrian Arab Republic',
4285 'TW': 'Taiwan, Province of China',
4286 'TJ': 'Tajikistan',
4287 'TZ': 'Tanzania, United Republic of',
4288 'TH': 'Thailand',
4289 'TL': 'Timor-Leste',
4290 'TG': 'Togo',
4291 'TK': 'Tokelau',
4292 'TO': 'Tonga',
4293 'TT': 'Trinidad and Tobago',
4294 'TN': 'Tunisia',
4295 'TR': 'Turkey',
4296 'TM': 'Turkmenistan',
4297 'TC': 'Turks and Caicos Islands',
4298 'TV': 'Tuvalu',
4299 'UG': 'Uganda',
4300 'UA': 'Ukraine',
4301 'AE': 'United Arab Emirates',
4302 'GB': 'United Kingdom',
4303 'US': 'United States',
4304 'UM': 'United States Minor Outlying Islands',
4305 'UY': 'Uruguay',
4306 'UZ': 'Uzbekistan',
4307 'VU': 'Vanuatu',
4308 'VE': 'Venezuela, Bolivarian Republic of',
4309 'VN': 'Viet Nam',
4310 'VG': 'Virgin Islands, British',
4311 'VI': 'Virgin Islands, U.S.',
4312 'WF': 'Wallis and Futuna',
4313 'EH': 'Western Sahara',
4314 'YE': 'Yemen',
4315 'ZM': 'Zambia',
4316 'ZW': 'Zimbabwe',
4317 }
4318
4319 @classmethod
4320 def short2full(cls, code):
4321 """Convert an ISO 3166-2 country code to the corresponding full name"""
4322 return cls._country_map.get(code.upper())
4323
4324
4325 class GeoUtils(object):
4326 # Major IPv4 address blocks per country
4327 _country_ip_map = {
4328 'AD': '46.172.224.0/19',
4329 'AE': '94.200.0.0/13',
4330 'AF': '149.54.0.0/17',
4331 'AG': '209.59.64.0/18',
4332 'AI': '204.14.248.0/21',
4333 'AL': '46.99.0.0/16',
4334 'AM': '46.70.0.0/15',
4335 'AO': '105.168.0.0/13',
4336 'AP': '182.50.184.0/21',
4337 'AQ': '23.154.160.0/24',
4338 'AR': '181.0.0.0/12',
4339 'AS': '202.70.112.0/20',
4340 'AT': '77.116.0.0/14',
4341 'AU': '1.128.0.0/11',
4342 'AW': '181.41.0.0/18',
4343 'AX': '185.217.4.0/22',
4344 'AZ': '5.197.0.0/16',
4345 'BA': '31.176.128.0/17',
4346 'BB': '65.48.128.0/17',
4347 'BD': '114.130.0.0/16',
4348 'BE': '57.0.0.0/8',
4349 'BF': '102.178.0.0/15',
4350 'BG': '95.42.0.0/15',
4351 'BH': '37.131.0.0/17',
4352 'BI': '154.117.192.0/18',
4353 'BJ': '137.255.0.0/16',
4354 'BL': '185.212.72.0/23',
4355 'BM': '196.12.64.0/18',
4356 'BN': '156.31.0.0/16',
4357 'BO': '161.56.0.0/16',
4358 'BQ': '161.0.80.0/20',
4359 'BR': '191.128.0.0/12',
4360 'BS': '24.51.64.0/18',
4361 'BT': '119.2.96.0/19',
4362 'BW': '168.167.0.0/16',
4363 'BY': '178.120.0.0/13',
4364 'BZ': '179.42.192.0/18',
4365 'CA': '99.224.0.0/11',
4366 'CD': '41.243.0.0/16',
4367 'CF': '197.242.176.0/21',
4368 'CG': '160.113.0.0/16',
4369 'CH': '85.0.0.0/13',
4370 'CI': '102.136.0.0/14',
4371 'CK': '202.65.32.0/19',
4372 'CL': '152.172.0.0/14',
4373 'CM': '102.244.0.0/14',
4374 'CN': '36.128.0.0/10',
4375 'CO': '181.240.0.0/12',
4376 'CR': '201.192.0.0/12',
4377 'CU': '152.206.0.0/15',
4378 'CV': '165.90.96.0/19',
4379 'CW': '190.88.128.0/17',
4380 'CY': '31.153.0.0/16',
4381 'CZ': '88.100.0.0/14',
4382 'DE': '53.0.0.0/8',
4383 'DJ': '197.241.0.0/17',
4384 'DK': '87.48.0.0/12',
4385 'DM': '192.243.48.0/20',
4386 'DO': '152.166.0.0/15',
4387 'DZ': '41.96.0.0/12',
4388 'EC': '186.68.0.0/15',
4389 'EE': '90.190.0.0/15',
4390 'EG': '156.160.0.0/11',
4391 'ER': '196.200.96.0/20',
4392 'ES': '88.0.0.0/11',
4393 'ET': '196.188.0.0/14',
4394 'EU': '2.16.0.0/13',
4395 'FI': '91.152.0.0/13',
4396 'FJ': '144.120.0.0/16',
4397 'FK': '80.73.208.0/21',
4398 'FM': '119.252.112.0/20',
4399 'FO': '88.85.32.0/19',
4400 'FR': '90.0.0.0/9',
4401 'GA': '41.158.0.0/15',
4402 'GB': '25.0.0.0/8',
4403 'GD': '74.122.88.0/21',
4404 'GE': '31.146.0.0/16',
4405 'GF': '161.22.64.0/18',
4406 'GG': '62.68.160.0/19',
4407 'GH': '154.160.0.0/12',
4408 'GI': '95.164.0.0/16',
4409 'GL': '88.83.0.0/19',
4410 'GM': '160.182.0.0/15',
4411 'GN': '197.149.192.0/18',
4412 'GP': '104.250.0.0/19',
4413 'GQ': '105.235.224.0/20',
4414 'GR': '94.64.0.0/13',
4415 'GT': '168.234.0.0/16',
4416 'GU': '168.123.0.0/16',
4417 'GW': '197.214.80.0/20',
4418 'GY': '181.41.64.0/18',
4419 'HK': '113.252.0.0/14',
4420 'HN': '181.210.0.0/16',
4421 'HR': '93.136.0.0/13',
4422 'HT': '148.102.128.0/17',
4423 'HU': '84.0.0.0/14',
4424 'ID': '39.192.0.0/10',
4425 'IE': '87.32.0.0/12',
4426 'IL': '79.176.0.0/13',
4427 'IM': '5.62.80.0/20',
4428 'IN': '117.192.0.0/10',
4429 'IO': '203.83.48.0/21',
4430 'IQ': '37.236.0.0/14',
4431 'IR': '2.176.0.0/12',
4432 'IS': '82.221.0.0/16',
4433 'IT': '79.0.0.0/10',
4434 'JE': '87.244.64.0/18',
4435 'JM': '72.27.0.0/17',
4436 'JO': '176.29.0.0/16',
4437 'JP': '133.0.0.0/8',
4438 'KE': '105.48.0.0/12',
4439 'KG': '158.181.128.0/17',
4440 'KH': '36.37.128.0/17',
4441 'KI': '103.25.140.0/22',
4442 'KM': '197.255.224.0/20',
4443 'KN': '198.167.192.0/19',
4444 'KP': '175.45.176.0/22',
4445 'KR': '175.192.0.0/10',
4446 'KW': '37.36.0.0/14',
4447 'KY': '64.96.0.0/15',
4448 'KZ': '2.72.0.0/13',
4449 'LA': '115.84.64.0/18',
4450 'LB': '178.135.0.0/16',
4451 'LC': '24.92.144.0/20',
4452 'LI': '82.117.0.0/19',
4453 'LK': '112.134.0.0/15',
4454 'LR': '102.183.0.0/16',
4455 'LS': '129.232.0.0/17',
4456 'LT': '78.56.0.0/13',
4457 'LU': '188.42.0.0/16',
4458 'LV': '46.109.0.0/16',
4459 'LY': '41.252.0.0/14',
4460 'MA': '105.128.0.0/11',
4461 'MC': '88.209.64.0/18',
4462 'MD': '37.246.0.0/16',
4463 'ME': '178.175.0.0/17',
4464 'MF': '74.112.232.0/21',
4465 'MG': '154.126.0.0/17',
4466 'MH': '117.103.88.0/21',
4467 'MK': '77.28.0.0/15',
4468 'ML': '154.118.128.0/18',
4469 'MM': '37.111.0.0/17',
4470 'MN': '49.0.128.0/17',
4471 'MO': '60.246.0.0/16',
4472 'MP': '202.88.64.0/20',
4473 'MQ': '109.203.224.0/19',
4474 'MR': '41.188.64.0/18',
4475 'MS': '208.90.112.0/22',
4476 'MT': '46.11.0.0/16',
4477 'MU': '105.16.0.0/12',
4478 'MV': '27.114.128.0/18',
4479 'MW': '102.70.0.0/15',
4480 'MX': '187.192.0.0/11',
4481 'MY': '175.136.0.0/13',
4482 'MZ': '197.218.0.0/15',
4483 'NA': '41.182.0.0/16',
4484 'NC': '101.101.0.0/18',
4485 'NE': '197.214.0.0/18',
4486 'NF': '203.17.240.0/22',
4487 'NG': '105.112.0.0/12',
4488 'NI': '186.76.0.0/15',
4489 'NL': '145.96.0.0/11',
4490 'NO': '84.208.0.0/13',
4491 'NP': '36.252.0.0/15',
4492 'NR': '203.98.224.0/19',
4493 'NU': '49.156.48.0/22',
4494 'NZ': '49.224.0.0/14',
4495 'OM': '5.36.0.0/15',
4496 'PA': '186.72.0.0/15',
4497 'PE': '186.160.0.0/14',
4498 'PF': '123.50.64.0/18',
4499 'PG': '124.240.192.0/19',
4500 'PH': '49.144.0.0/13',
4501 'PK': '39.32.0.0/11',
4502 'PL': '83.0.0.0/11',
4503 'PM': '70.36.0.0/20',
4504 'PR': '66.50.0.0/16',
4505 'PS': '188.161.0.0/16',
4506 'PT': '85.240.0.0/13',
4507 'PW': '202.124.224.0/20',
4508 'PY': '181.120.0.0/14',
4509 'QA': '37.210.0.0/15',
4510 'RE': '102.35.0.0/16',
4511 'RO': '79.112.0.0/13',
4512 'RS': '93.86.0.0/15',
4513 'RU': '5.136.0.0/13',
4514 'RW': '41.186.0.0/16',
4515 'SA': '188.48.0.0/13',
4516 'SB': '202.1.160.0/19',
4517 'SC': '154.192.0.0/11',
4518 'SD': '102.120.0.0/13',
4519 'SE': '78.64.0.0/12',
4520 'SG': '8.128.0.0/10',
4521 'SI': '188.196.0.0/14',
4522 'SK': '78.98.0.0/15',
4523 'SL': '102.143.0.0/17',
4524 'SM': '89.186.32.0/19',
4525 'SN': '41.82.0.0/15',
4526 'SO': '154.115.192.0/18',
4527 'SR': '186.179.128.0/17',
4528 'SS': '105.235.208.0/21',
4529 'ST': '197.159.160.0/19',
4530 'SV': '168.243.0.0/16',
4531 'SX': '190.102.0.0/20',
4532 'SY': '5.0.0.0/16',
4533 'SZ': '41.84.224.0/19',
4534 'TC': '65.255.48.0/20',
4535 'TD': '154.68.128.0/19',
4536 'TG': '196.168.0.0/14',
4537 'TH': '171.96.0.0/13',
4538 'TJ': '85.9.128.0/18',
4539 'TK': '27.96.24.0/21',
4540 'TL': '180.189.160.0/20',
4541 'TM': '95.85.96.0/19',
4542 'TN': '197.0.0.0/11',
4543 'TO': '175.176.144.0/21',
4544 'TR': '78.160.0.0/11',
4545 'TT': '186.44.0.0/15',
4546 'TV': '202.2.96.0/19',
4547 'TW': '120.96.0.0/11',
4548 'TZ': '156.156.0.0/14',
4549 'UA': '37.52.0.0/14',
4550 'UG': '102.80.0.0/13',
4551 'US': '6.0.0.0/8',
4552 'UY': '167.56.0.0/13',
4553 'UZ': '84.54.64.0/18',
4554 'VA': '212.77.0.0/19',
4555 'VC': '207.191.240.0/21',
4556 'VE': '186.88.0.0/13',
4557 'VG': '66.81.192.0/20',
4558 'VI': '146.226.0.0/16',
4559 'VN': '14.160.0.0/11',
4560 'VU': '202.80.32.0/20',
4561 'WF': '117.20.32.0/21',
4562 'WS': '202.4.32.0/19',
4563 'YE': '134.35.0.0/16',
4564 'YT': '41.242.116.0/22',
4565 'ZA': '41.0.0.0/11',
4566 'ZM': '102.144.0.0/13',
4567 'ZW': '102.177.192.0/18',
4568 }
4569
4570 @classmethod
4571 def random_ipv4(cls, code_or_block):
4572 if len(code_or_block) == 2:
4573 block = cls._country_ip_map.get(code_or_block.upper())
4574 if not block:
4575 return None
4576 else:
4577 block = code_or_block
4578 addr, preflen = block.split('/')
4579 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4580 addr_max = addr_min | (0xffffffff >> int(preflen))
4581 return compat_str(socket.inet_ntoa(
4582 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
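# e.g. GeoUtils.random_ipv4('DE') picks a random address from the 'DE' block above,
# while GeoUtils.random_ipv4('198.51.100.0/24') uses the given block directly
# (two-letter input is treated as a country code, anything else as a CIDR block).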
4583
4584
4585 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4586 def __init__(self, proxies=None):
4587 # Set default handlers
4588 for type in ('http', 'https'):
4589 setattr(self, '%s_open' % type,
4590 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4591 meth(r, proxy, type))
4592 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4593
4594 def proxy_open(self, req, proxy, type):
4595 req_proxy = req.headers.get('Ytdl-request-proxy')
4596 if req_proxy is not None:
4597 proxy = req_proxy
4598 del req.headers['Ytdl-request-proxy']
4599
4600 if proxy == '__noproxy__':
4601 return None # No Proxy
4602 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4603 req.add_header('Ytdl-socks-proxy', proxy)
4604 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4605 return None
4606 return compat_urllib_request.ProxyHandler.proxy_open(
4607 self, req, proxy, type)
4608
4609
4610 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4611 # released into the public domain
4612 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4613
4614 def long_to_bytes(n, blocksize=0):
4615 """long_to_bytes(n:long, blocksize:int) : string
4616 Convert a long integer to a byte string.
4617
4618 If optional blocksize is given and greater than zero, pad the front of the
4619 byte string with binary zeros so that the length is a multiple of
4620 blocksize.
4621 """
4622 # after much testing, this algorithm was deemed to be the fastest
4623 s = b''
4624 n = int(n)
4625 while n > 0:
4626 s = compat_struct_pack('>I', n & 0xffffffff) + s
4627 n = n >> 32
4628 # strip off leading zeros
4629 for i in range(len(s)):
4630 if s[i] != b'\000'[0]:
4631 break
4632 else:
4633 # only happens when n == 0
4634 s = b'\000'
4635 i = 0
4636 s = s[i:]
4637 # add back some pad bytes. this could be done more efficiently w.r.t. the
4638 # de-padding being done above, but sigh...
4639 if blocksize > 0 and len(s) % blocksize:
4640 s = (blocksize - len(s) % blocksize) * b'\000' + s
4641 return s
4642
4643
4644 def bytes_to_long(s):
4645 """bytes_to_long(string) : long
4646 Convert a byte string to a long integer.
4647
4648 This is (essentially) the inverse of long_to_bytes().
4649 """
4650 acc = 0
4651 length = len(s)
4652 if length % 4:
4653 extra = (4 - length % 4)
4654 s = b'\000' * extra + s
4655 length = length + extra
4656 for i in range(0, length, 4):
4657 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4658 return acc
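# Round-trip sketch:
#   long_to_bytes(65537)            # => b'\x01\x00\x01'
#   long_to_bytes(65537, 8)         # => b'\x00\x00\x00\x00\x00\x01\x00\x01'
#   bytes_to_long(b'\x01\x00\x01')  # => 65537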
4659
4660
4661 def ohdave_rsa_encrypt(data, exponent, modulus):
4662 '''
4663 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4664
4665 Input:
4666 data: data to encrypt, bytes-like object
4667 exponent, modulus: parameter e and N of RSA algorithm, both integer
4668 Output: hex string of encrypted data
4669
4670 Limitation: supports one block encryption only
4671 '''
4672
4673 payload = int(binascii.hexlify(data[::-1]), 16)
4674 encrypted = pow(payload, exponent, modulus)
4675 return '%x' % encrypted
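# Toy-sized sketch (real callers pass the site-provided e and N):
#   ohdave_rsa_encrypt(b'\x02', 3, 101)  # payload 2, pow(2, 3, 101) == 8 => '8'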
4676
4677
4678 def pkcs1pad(data, length):
4679 """
4680 Padding input data with PKCS#1 scheme
4681
4682 @param {int[]} data input data
4683 @param {int} length target length
4684 @returns {int[]} padded data
4685 """
4686 if len(data) > length - 11:
4687 raise ValueError('Input data too long for PKCS#1 padding')
4688
4689 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)] # PKCS#1 requires the padding octets to be non-zero
4690 return [0, 2] + pseudo_random + [0] + data
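# Resulting layout (PKCS#1 v1.5 block type 2), e.g. for length=16 and 3 data bytes:
#   pkcs1pad([1, 2, 3], 16)  # => [0, 2, <10 random non-zero octets>, 0, 1, 2, 3]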
4691
4692
4693 def encode_base_n(num, n, table=None):
4694 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4695 if not table:
4696 table = FULL_TABLE[:n]
4697
4698 if n > len(table):
4699 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4700
4701 if num == 0:
4702 return table[0]
4703
4704 ret = ''
4705 while num:
4706 ret = table[num % n] + ret
4707 num = num // n
4708 return ret
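# e.g. encode_base_n(255, 16)  # => 'ff'
#      encode_base_n(9, 2)     # => '1001'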
4709
4710
4711 def decode_packed_codes(code):
4712 mobj = re.search(PACKED_CODES_RE, code)
4713 obfuscated_code, base, count, symbols = mobj.groups()
4714 base = int(base)
4715 count = int(count)
4716 symbols = symbols.split('|')
4717 symbol_table = {}
4718
4719 while count:
4720 count -= 1
4721 base_n_count = encode_base_n(count, base)
4722 symbol_table[base_n_count] = symbols[count] or base_n_count
4723
4724 return re.sub(
4725 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4726 obfuscated_code)
4727
4728
4729 def caesar(s, alphabet, shift):
4730 if shift == 0:
4731 return s
4732 l = len(alphabet)
4733 return ''.join(
4734 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4735 for c in s)
4736
4737
4738 def rot47(s):
4739 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
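# e.g. caesar('ab', 'abc', 1)  # => 'bc' (characters outside the alphabet pass through)
# rot47 shifts within the printable ASCII range 33-126 and is its own inverse:
#   rot47(rot47(s)) == s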
4740
4741
4742 def parse_m3u8_attributes(attrib):
4743 info = {}
4744 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4745 if val.startswith('"'):
4746 val = val[1:-1]
4747 info[key] = val
4748 return info
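# e.g. parse_m3u8_attributes('BANDWIDTH=2560000,CODECS="avc1.4d401f,mp4a.40.2"')
#   # => {'BANDWIDTH': '2560000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}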
4749
4750
4751 def urshift(val, n):
4752 return val >> n if val >= 0 else (val + 0x100000000) >> n
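# Emulates JavaScript's unsigned right shift '>>>' for 32-bit values,
# e.g. urshift(-1, 28)  # => 15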
4753
4754
4755 # Based on png2str() written by @gdkchan and improved by @yokrysty
4756 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4757 def decode_png(png_data):
4758 # Reference: https://www.w3.org/TR/PNG/
4759 header = png_data[8:]
4760
4761 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4762 raise IOError('Not a valid PNG file.')
4763
4764 int_map = {1: '>B', 2: '>H', 4: '>I'}
4765 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4766
4767 chunks = []
4768
4769 while header:
4770 length = unpack_integer(header[:4])
4771 header = header[4:]
4772
4773 chunk_type = header[:4]
4774 header = header[4:]
4775
4776 chunk_data = header[:length]
4777 header = header[length:]
4778
4779 header = header[4:] # Skip CRC
4780
4781 chunks.append({
4782 'type': chunk_type,
4783 'length': length,
4784 'data': chunk_data
4785 })
4786
4787 ihdr = chunks[0]['data']
4788
4789 width = unpack_integer(ihdr[:4])
4790 height = unpack_integer(ihdr[4:8])
4791
4792 idat = b''
4793
4794 for chunk in chunks:
4795 if chunk['type'] == b'IDAT':
4796 idat += chunk['data']
4797
4798 if not idat:
4799 raise IOError('Unable to read PNG data.')
4800
4801 decompressed_data = bytearray(zlib.decompress(idat))
4802
4803 stride = width * 3
4804 pixels = []
4805
4806 def _get_pixel(idx):
4807 x = idx % stride
4808 y = idx // stride
4809 return pixels[y][x]
4810
4811 for y in range(height):
4812 basePos = y * (1 + stride)
4813 filter_type = decompressed_data[basePos]
4814
4815 current_row = []
4816
4817 pixels.append(current_row)
4818
4819 for x in range(stride):
4820 color = decompressed_data[1 + basePos + x]
4821 basex = y * stride + x
4822 left = 0
4823 up = 0
4824
4825 if x > 2:
4826 left = _get_pixel(basex - 3)
4827 if y > 0:
4828 up = _get_pixel(basex - stride)
4829
4830 if filter_type == 1: # Sub
4831 color = (color + left) & 0xff
4832 elif filter_type == 2: # Up
4833 color = (color + up) & 0xff
4834 elif filter_type == 3: # Average
4835 color = (color + ((left + up) >> 1)) & 0xff
4836 elif filter_type == 4: # Paeth
4837 a = left
4838 b = up
4839 c = 0
4840
4841 if x > 2 and y > 0:
4842 c = _get_pixel(basex - stride - 3)
4843
4844 p = a + b - c
4845
4846 pa = abs(p - a)
4847 pb = abs(p - b)
4848 pc = abs(p - c)
4849
4850 if pa <= pb and pa <= pc:
4851 color = (color + a) & 0xff
4852 elif pb <= pc:
4853 color = (color + b) & 0xff
4854 else:
4855 color = (color + c) & 0xff
4856
4857 current_row.append(color)
4858
4859 return width, height, pixels
4860
4861
4862 def write_xattr(path, key, value):
4863 # This mess below finds the best xattr tool for the job
4864 try:
4865 # try the pyxattr module...
4866 import xattr
4867
4868 if hasattr(xattr, 'set'): # pyxattr
4869 # Unicode arguments are not supported in python-pyxattr until
4870 # version 0.5.0
4871 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4872 pyxattr_required_version = '0.5.0'
4873 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4874 # TODO: fallback to CLI tools
4875 raise XAttrUnavailableError(
4876 'python-pyxattr is detected but is too old. '
4877 'yt-dlp requires %s or above while your version is %s. '
4878 'Falling back to other xattr implementations' % (
4879 pyxattr_required_version, xattr.__version__))
4880
4881 setxattr = xattr.set
4882 else: # xattr
4883 setxattr = xattr.setxattr
4884
4885 try:
4886 setxattr(path, key, value)
4887 except EnvironmentError as e:
4888 raise XAttrMetadataError(e.errno, e.strerror)
4889
4890 except ImportError:
4891 if compat_os_name == 'nt':
4892 # Write xattrs to NTFS Alternate Data Streams:
4893 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4894 assert ':' not in key
4895 assert os.path.exists(path)
4896
4897 ads_fn = path + ':' + key
4898 try:
4899 with open(ads_fn, 'wb') as f:
4900 f.write(value)
4901 except EnvironmentError as e:
4902 raise XAttrMetadataError(e.errno, e.strerror)
4903 else:
4904 user_has_setfattr = check_executable('setfattr', ['--version'])
4905 user_has_xattr = check_executable('xattr', ['-h'])
4906
4907 if user_has_setfattr or user_has_xattr:
4908
4909 value = value.decode('utf-8')
4910 if user_has_setfattr:
4911 executable = 'setfattr'
4912 opts = ['-n', key, '-v', value]
4913 elif user_has_xattr:
4914 executable = 'xattr'
4915 opts = ['-w', key, value]
4916
4917 cmd = ([encodeFilename(executable, True)]
4918 + [encodeArgument(o) for o in opts]
4919 + [encodeFilename(path, True)])
4920
4921 try:
4922 p = Popen(
4923 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4924 except EnvironmentError as e:
4925 raise XAttrMetadataError(e.errno, e.strerror)
4926 stdout, stderr = p.communicate_or_kill()
4927 stderr = stderr.decode('utf-8', 'replace')
4928 if p.returncode != 0:
4929 raise XAttrMetadataError(p.returncode, stderr)
4930
4931 else:
4932 # On Unix, and can't find pyxattr, setfattr, or xattr.
4933 if sys.platform.startswith('linux'):
4934 raise XAttrUnavailableError(
4935 "Couldn't find a tool to set the xattrs. "
4936 "Install either the python 'pyxattr' or 'xattr' "
4937 "modules, or the GNU 'attr' package "
4938 "(which contains the 'setfattr' tool).")
4939 else:
4940 raise XAttrUnavailableError(
4941 "Couldn't find a tool to set the xattrs. "
4942 "Install either the python 'xattr' module, "
4943 "or the 'xattr' binary.")
4944
4945
4946 def random_birthday(year_field, month_field, day_field):
4947 start_date = datetime.date(1950, 1, 1)
4948 end_date = datetime.date(1995, 12, 31)
4949 offset = random.randint(0, (end_date - start_date).days)
4950 random_date = start_date + datetime.timedelta(offset)
4951 return {
4952 year_field: str(random_date.year),
4953 month_field: str(random_date.month),
4954 day_field: str(random_date.day),
4955 }
4956
4957
4958 # Templates for internet shortcut files, which are plain text files.
4959 DOT_URL_LINK_TEMPLATE = '''
4960 [InternetShortcut]
4961 URL=%(url)s
4962 '''.lstrip()
4963
4964 DOT_WEBLOC_LINK_TEMPLATE = '''
4965 <?xml version="1.0" encoding="UTF-8"?>
4966 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4967 <plist version="1.0">
4968 <dict>
4969 \t<key>URL</key>
4970 \t<string>%(url)s</string>
4971 </dict>
4972 </plist>
4973 '''.lstrip()
4974
4975 DOT_DESKTOP_LINK_TEMPLATE = '''
4976 [Desktop Entry]
4977 Encoding=UTF-8
4978 Name=%(filename)s
4979 Type=Link
4980 URL=%(url)s
4981 Icon=text-html
4982 '''.lstrip()
4983
4984 LINK_TEMPLATES = {
4985 'url': DOT_URL_LINK_TEMPLATE,
4986 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4987 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4988 }
4989
4990
4991 def iri_to_uri(iri):
4992 """
4993 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4994
4995 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4996 """
4997
4998 iri_parts = compat_urllib_parse_urlparse(iri)
4999
5000 if '[' in iri_parts.netloc:
5001 raise ValueError('IPv6 URIs are not yet supported.')
5002 # Querying `.netloc` also raises a ValueError when there is only one bracket.
5003
5004 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5005
5006 net_location = ''
5007 if iri_parts.username:
5008 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
5009 if iri_parts.password is not None:
5010 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
5011 net_location += '@'
5012
5013 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
5014 # The 'idna' encoding produces ASCII text.
5015 if iri_parts.port is not None and iri_parts.port != 80:
5016 net_location += ':' + str(iri_parts.port)
5017
5018 return compat_urllib_parse_urlunparse(
5019 (iri_parts.scheme,
5020 net_location,
5021
5022 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5023
5024 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5025 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5026
5027 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5028 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5029
5030 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5031
5032 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
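# Illustrative conversion:
#   iri_to_uri('http://example.com/ä?q=ö')  # => 'http://example.com/%C3%A4?q=%C3%B6'
# Already-escaped sequences such as %3C stay untouched since '%' is in every `safe` set.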
5033
5034
5035 def to_high_limit_path(path):
5036 if sys.platform in ['win32', 'cygwin']:
5037 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5038 return r'\\?\ '.rstrip() + os.path.abspath(path)
5039
5040 return path
5041
5042
5043 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
5044 val = traverse_obj(obj, *variadic(field))
5045 if val in ignore:
5046 return default
5047 return template % (func(val) if func else val)
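# e.g. format_field({'width': 1080}, 'width', '%dpx')        # => '1080px'
#      format_field({}, 'width', '%dpx', default='unknown')  # => 'unknown'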
5048
5049
5050 def clean_podcast_url(url):
5051 return re.sub(r'''(?x)
5052 (?:
5053 (?:
5054 chtbl\.com/track|
5055 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5056 play\.podtrac\.com
5057 )/[^/]+|
5058 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5059 flex\.acast\.com|
5060 pd(?:
5061 cn\.co| # https://podcorn.com/analytics-prefix/
5062 st\.fm # https://podsights.com/docs/
5063 )/e
5064 )/''', '', url)
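# e.g. clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.megaphone.fm/ep.mp3')
#   # => 'https://traffic.megaphone.fm/ep.mp3' (hypothetical URL, for illustration)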
5065
5066
5067 _HEX_TABLE = '0123456789abcdef'
5068
5069
5070 def random_uuidv4():
5071 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5072
5073
5074 def make_dir(path, to_screen=None):
5075 try:
5076 dn = os.path.dirname(path)
5077 if dn and not os.path.exists(dn):
5078 os.makedirs(dn)
5079 return True
5080 except (OSError, IOError) as err:
5081 if callable(to_screen):
5082 to_screen('unable to create directory ' + error_to_compat_str(err))
5083 return False
5084
5085
5086 def get_executable_path():
5087 from zipimport import zipimporter
5088 if hasattr(sys, 'frozen'): # Running from PyInstaller
5089 path = os.path.dirname(sys.executable)
5090 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5091 path = os.path.join(os.path.dirname(__file__), '../..')
5092 else:
5093 path = os.path.join(os.path.dirname(__file__), '..')
5094 return os.path.abspath(path)
5095
5096
5097 def load_plugins(name, suffix, namespace):
5098 classes = {}
5099 try:
5100 plugins_spec = importlib.util.spec_from_file_location(
5101 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5102 plugins = importlib.util.module_from_spec(plugins_spec)
5103 sys.modules[plugins_spec.name] = plugins
5104 plugins_spec.loader.exec_module(plugins)
5105 for name in dir(plugins):
5106 if name in namespace:
5107 continue
5108 if not name.endswith(suffix):
5109 continue
5110 klass = getattr(plugins, name)
5111 classes[name] = namespace[name] = klass
5112 except FileNotFoundError:
5113 pass
5114 return classes
5115
5116
5117 def traverse_obj(
5118 obj, *path_list, default=None, expected_type=None, get_all=True,
5119 casesense=True, is_user_input=False, traverse_string=False):
5120 ''' Traverse nested list/dict/tuple
5121 @param path_list A list of paths which are checked one by one.
5122 Each path is a list of keys where each key is a string,
5123 a function, a tuple of strings/None or "...".
5124 When a function is given, it takes the key as an argument and
5125 returns whether the key matches or not. When a tuple is given,
5126 all the keys given in the tuple are traversed.
5127 "..." traverses all the keys in the object and
5128 "None" returns the object without traversal.
5129 @param default Default value to return
5130 @param expected_type Only accept final value of this type (Can also be any callable)
5131 @param get_all Return all the values obtained from a path or only the first one
5132 @param casesense Whether to consider dictionary keys as case sensitive
5133 @param is_user_input Whether the keys are generated from user input. If True,
5134 strings are converted to int/slice if necessary
5135 @param traverse_string Whether to traverse inside strings. If True, any
5136 non-compatible object will also be converted into a string
5137 # TODO: Write tests
5138 '''
5139 if not casesense:
5140 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5141 path_list = (map(_lower, variadic(path)) for path in path_list)
5142
5143 def _traverse_obj(obj, path, _current_depth=0):
5144 nonlocal depth
5145 path = tuple(variadic(path))
5146 for i, key in enumerate(path):
5147 if None in (key, obj):
5148 return obj
5149 if isinstance(key, (list, tuple)):
5150 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5151 key = ...
5152 if key is ...:
5153 obj = (obj.values() if isinstance(obj, dict)
5154 else obj if isinstance(obj, (list, tuple, LazyList))
5155 else str(obj) if traverse_string else [])
5156 _current_depth += 1
5157 depth = max(depth, _current_depth)
5158 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5159 elif callable(key):
5160 if isinstance(obj, (list, tuple, LazyList)):
5161 obj = enumerate(obj)
5162 elif isinstance(obj, dict):
5163 obj = obj.items()
5164 else:
5165 if not traverse_string:
5166 return None
5167 obj = str(obj)
5168 _current_depth += 1
5169 depth = max(depth, _current_depth)
5170 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5171 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5172 obj = (obj.get(key) if casesense or (key in obj)
5173 else next((v for k, v in obj.items() if _lower(k) == key), None))
5174 else:
5175 if is_user_input:
5176 key = (int_or_none(key) if ':' not in key
5177 else slice(*map(int_or_none, key.split(':'))))
5178 if key == slice(None):
5179 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5180 if not isinstance(key, (int, slice)):
5181 return None
5182 if not isinstance(obj, (list, tuple, LazyList)):
5183 if not traverse_string:
5184 return None
5185 obj = str(obj)
5186 try:
5187 obj = obj[key]
5188 except IndexError:
5189 return None
5190 return obj
5191
5192 if isinstance(expected_type, type):
5193 type_test = lambda val: val if isinstance(val, expected_type) else None
5194 elif expected_type is not None:
5195 type_test = expected_type
5196 else:
5197 type_test = lambda val: val
5198
5199 for path in path_list:
5200 depth = 0
5201 val = _traverse_obj(obj, path)
5202 if val is not None:
5203 if depth:
5204 for _ in range(depth - 1):
5205 val = itertools.chain.from_iterable(v for v in val if v is not None)
5206 val = [v for v in map(type_test, val) if v is not None]
5207 if val:
5208 return val if get_all else val[0]
5209 else:
5210 val = type_test(val)
5211 if val is not None:
5212 return val
5213 return default
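# Traversal sketch (hypothetical data):
#   d = {'x': [{'y': 1}, {'y': 2}]}
#   traverse_obj(d, ('x', 0, 'y'))          # => 1
#   traverse_obj(d, ('x', ..., 'y'))        # => [1, 2]
#   traverse_obj(d, 'missing', default=-1)  # => -1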
5214
5215
5216 def traverse_dict(dictn, keys, casesense=True):
5217 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5218 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5219 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5220
5221
5222 def get_first(obj, keys, **kwargs):
5223 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5224
5225
5226 def variadic(x, allowed_types=(str, bytes, dict)):
5227 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
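# e.g. variadic('spam')  # => ('spam',) since str is in allowed_types
#      variadic([1, 2])  # => [1, 2] (already a non-excluded iterable)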
5228
5229
5230 def decode_base(value, digits):
5231 # Convert the given base-N string (over the alphabet `digits`) to an integer
5232 table = {char: index for index, char in enumerate(digits)}
5233 result = 0
5234 base = len(digits)
5235 for chr in value:
5236 result *= base
5237 result += table[chr]
5238 return result
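# e.g. decode_base('ff', '0123456789abcdef')  # => 255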
5239
5240
5241 def time_seconds(**kwargs):
5242 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5243 return t.timestamp()
5244
5245
5246 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5247 # the resulting format is JWS Compact Serialization,
5248 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5249 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
5250 def jwt_encode_hs256(payload_data, key, headers={}):
5251 header_data = {
5252 'alg': 'HS256',
5253 'typ': 'JWT',
5254 }
5255 if headers:
5256 header_data.update(headers)
5257 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5258 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5259 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5260 signature_b64 = base64.b64encode(h.digest())
5261 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5262 return token
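# Usage sketch (illustrative key and payload):
#   jwt_encode_hs256({'sub': '123'}, 'secret')  # => b'<header>.<payload>.<signature>'
# Note: this uses standard base64, not the unpadded base64url that RFC 7515 mandates.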
5263
5264
5265 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5266 def jwt_decode_hs256(jwt):
5267 header_b64, payload_b64, signature_b64 = jwt.split('.')
5268 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5269 return payload_data
5270
5271
5272 def supports_terminal_sequences(stream):
5273 if compat_os_name == 'nt':
5274 from .compat import WINDOWS_VT_MODE # Must be imported locally
5275 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5276 return False
5277 elif not os.getenv('TERM'):
5278 return False
5279 try:
5280 return stream.isatty()
5281 except BaseException:
5282 return False
5283
5284
5285 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5286
5287
5288 def remove_terminal_sequences(string):
5289 return _terminal_sequences_re.sub('', string)
5290
5291
5292 def number_of_digits(number):
5293 return len('%d' % number)
5294
5295
5296 def join_nonempty(*values, delim='-', from_dict=None):
5297 if from_dict is not None:
5298 values = map(from_dict.get, values)
5299 return delim.join(map(str, filter(None, values)))
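# e.g. join_nonempty('en', None, 'US')                         # => 'en-US'
#      join_nonempty('a', 'b', from_dict={'a': 1, 'b': None})  # => '1'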
5300
5301
5302 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5303 """
5304 Find the largest format dimensions in terms of video width and, for each thumbnail:
5305 * Modify the URL: Match the width with the provided regex and replace with the former width
5306 * Update dimensions
5307
5308 This function is useful with video services that scale the provided thumbnails on demand
5309 """
5310 _keys = ('width', 'height')
5311 max_dimensions = max(
5312 [tuple(format.get(k) or 0 for k in _keys) for format in formats],
5313 default=(0, 0))
5314 if not max_dimensions[0]:
5315 return thumbnails
5316 return [
5317 merge_dicts(
5318 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5319 dict(zip(_keys, max_dimensions)), thumbnail)
5320 for thumbnail in thumbnails
5321 ]
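# Usage sketch (illustrative data), for a thumbnail URL that embeds its width:
#   scale_thumbnails_to_max_format_width(
#       [{'width': 1920, 'height': 1080}], [{'url': 'https://e.com/w640/a.jpg'}], r'(?<=/w)\d+')
#   # => [{'url': 'https://e.com/w1920/a.jpg', 'width': 1920, 'height': 1080}]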
5322
5323
5324 def parse_http_range(range):
5325 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5326 if not range:
5327 return None, None, None
5328 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5329 if not crg:
5330 return None, None, None
5331 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
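# e.g. parse_http_range('bytes 200-999/8000')  # => (200, 999, 8000)
#      parse_http_range('bytes=500-')          # => (500, None, None)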
5332
5333
5334 class Config:
5335 own_args = None
5336 filename = None
5337 __initialized = False
5338
5339 def __init__(self, parser, label=None):
5340 self._parser, self.label = parser, label
5341 self._loaded_paths, self.configs = set(), []
5342
5343 def init(self, args=None, filename=None):
5344 assert not self.__initialized
5345 directory = ''
5346 if filename:
5347 location = os.path.realpath(filename)
5348 directory = os.path.dirname(location)
5349 if location in self._loaded_paths:
5350 return False
5351 self._loaded_paths.add(location)
5352
5353 self.__initialized = True
5354 self.own_args, self.filename = args, filename
5355 for location in self._parser.parse_args(args)[0].config_locations or []:
5356 location = os.path.join(directory, expand_path(location))
5357 if os.path.isdir(location):
5358 location = os.path.join(location, 'yt-dlp.conf')
5359 if not os.path.exists(location):
5360 self._parser.error(f'config location {location} does not exist')
5361 self.append_config(self.read_file(location), location)
5362 return True
5363
5364 def __str__(self):
5365 label = join_nonempty(
5366 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5367 delim=' ')
5368 return join_nonempty(
5369 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5370 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5371 delim='\n')
5372
5373 @staticmethod
5374 def read_file(filename, default=[]):
5375 try:
5376 optionf = open(filename)
5377 except IOError:
5378 return default # silently skip if file is not present
5379 try:
5380 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5381 contents = optionf.read()
5382 if sys.version_info < (3,):
5383 contents = contents.decode(preferredencoding())
5384 res = compat_shlex_split(contents, comments=True)
5385 finally:
5386 optionf.close()
5387 return res
5388
5389 @staticmethod
5390 def hide_login_info(opts):
5391 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5392 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5393
5394 def _scrub_eq(o):
5395 m = eqre.match(o)
5396 if m:
5397 return m.group('key') + '=PRIVATE'
5398 else:
5399 return o
5400
5401 opts = list(map(_scrub_eq, opts))
5402 for idx, opt in enumerate(opts):
5403 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5404 opts[idx + 1] = 'PRIVATE'
5405 return opts
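# e.g. Config.hide_login_info(['-u', 'name', '--password=secret'])
#   # => ['-u', 'PRIVATE', '--password=PRIVATE']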
5406
5407 def append_config(self, *args, label=None):
5408 config = type(self)(self._parser, label)
5409 config._loaded_paths = self._loaded_paths
5410 if config.init(*args):
5411 self.configs.append(config)
5412
5413 @property
5414 def all_args(self):
5415 for config in reversed(self.configs):
5416 yield from config.all_args
5417 yield from self.own_args or []
5418
5419 def parse_args(self):
5420 return self._parser.parse_args(list(self.all_args))
5421
5422
5423 class WebSocketsWrapper():
5424 """Wraps websockets module to use in non-async scopes"""
5425
5426 def __init__(self, url, headers=None):
5427 self.loop = asyncio.events.new_event_loop()
5428 self.conn = compat_websockets.connect(
5429 url, extra_headers=headers, ping_interval=None,
5430 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5431 atexit.register(self.__exit__, None, None, None)
5432
5433 def __enter__(self):
5434 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5435 return self
5436
5437 def send(self, *args):
5438 self.run_with_loop(self.pool.send(*args), self.loop)
5439
5440 def recv(self, *args):
5441 return self.run_with_loop(self.pool.recv(*args), self.loop)
5442
5443 def __exit__(self, type, value, traceback):
5444 try:
5445 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5446 finally:
5447 self._cancel_all_tasks(self.loop) # cancel leftover tasks while the loop can still run them
5448 self.loop.close()
5449
5450 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5451 # For contributors: if any new library that uses asyncio needs to run in a non-async scope, move these functions out of this class
5452 @staticmethod
5453 def run_with_loop(main, loop):
5454 if not asyncio.coroutines.iscoroutine(main):
5455 raise ValueError(f'a coroutine was expected, got {main!r}')
5456
5457 try:
5458 return loop.run_until_complete(main)
5459 finally:
5460 loop.run_until_complete(loop.shutdown_asyncgens())
5461 if hasattr(loop, 'shutdown_default_executor'):
5462 loop.run_until_complete(loop.shutdown_default_executor())
5463
5464 @staticmethod
5465 def _cancel_all_tasks(loop):
5466 to_cancel = asyncio.tasks.all_tasks(loop)
5467
5468 if not to_cancel:
5469 return
5470
5471 for task in to_cancel:
5472 task.cancel()
5473
5474 loop.run_until_complete(
5475 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5476
5477 for task in to_cancel:
5478 if task.cancelled():
5479 continue
5480 if task.exception() is not None:
5481 loop.call_exception_handler({
5482 'message': 'unhandled exception during asyncio.run() shutdown',
5483 'exception': task.exception(),
5484 'task': task,
5485 })
5486
5487
5488 has_websockets = bool(compat_websockets)
5489
5490
5491 def merge_headers(*dicts):
5492 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5493 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
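# e.g. merge_headers({'user-agent': 'UA1'}, {'User-Agent': 'UA2'})  # => {'User-Agent': 'UA2'}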