#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

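# Illustrative usage (added for clarity, not in the original source):
# xpath_with_ns expands prefixed path components via a caller-supplied
# prefix -> URI map. The 'media' prefix and URI are hypothetical values.
#
#   >>> xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}url'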

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of all tags with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

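# Illustrative usage (added for clarity, not in the original source):
#
#   >>> timeconvert('Wed, 14 Jun 2017 11:00:00 GMT')
#   1497438000
#   >>> timeconvert('not a date') is None
#   True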

def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

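# Illustrative usage (added for clarity, not in the original source). In
# restricted mode, ':' becomes '_-' and spaces become '_'; unrestricted
# mode keeps the name human-readable:
#
#   >>> sanitize_filename('Foo: Bar', restricted=True)
#   'Foo_-_Bar'
#   >>> sanitize_filename('Foo: Bar')
#   'Foo - Bar'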

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')

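# Illustrative usage (added for clarity, not in the original source): the
# credentials are stripped from the URL and returned as a ready-made
# Authorization header value ('user:pass' base64-encodes to 'dXNlcjpwYXNz'):
#
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')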

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

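# Illustrative usage (added for clarity, not in the original source): unlike
# set(), the first-occurrence order is preserved:
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]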

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

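# Illustrative usage (added for clarity, not in the original source): named
# and numeric entities are both handled by _htmlentity_transform:
#
#   >>> unescapeHTML('foo &amp; bar &#39;baz&#39;')
#   "foo & bar 'baz'"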

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)

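# Illustrative usage (added for clarity, not in the original source):
#
#   >>> timetuple_from_msec(345123)
#   Time(hours=0, minutes=5, seconds=45, milliseconds=123)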

def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

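# Illustrative usage (added for clarity, not in the original source): hours
# and milliseconds only appear when present/requested:
#
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'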

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1646
1647
1648 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1649 """YoutubeDL redirect handler
1650
1651 The code is based on HTTPRedirectHandler implementation from CPython [1].
1652
1653 This redirect handler solves two issues:
1654 - ensures redirect URL is always unicode under python 2
1655 - introduces support for experimental HTTP response status code
1656 308 Permanent Redirect [2] used by some sites [3]
1657
1658 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1659 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1660 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1661 """
1662
1663 http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1664
1665 def redirect_request(self, req, fp, code, msg, headers, newurl):
1666 """Return a Request or None in response to a redirect.
1667
1668 This is called by the http_error_30x methods when a
1669 redirection response is received. If a redirection should
1670 take place, return a new Request to allow http_error_30x to
1671 perform the redirect. Otherwise, raise HTTPError if no-one
1672 else should try to handle this url. Return None if you can't
1673 but another Handler might.
1674 """
1675 m = req.get_method()
1676 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1677 or code in (301, 302, 303) and m == "POST")):
1678 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1679 # Strictly (according to RFC 2616), 301 or 302 in response to
1680 # a POST MUST NOT cause a redirection without confirmation
1681 # from the user (of urllib.request, in this case). In practice,
1682 # essentially all clients do redirect in this case, so we do
1683 # the same.
1684
1685 # On python 2 urlh.geturl() may sometimes return redirect URL
1686 # as byte string instead of unicode. This workaround allows
1687 # to force it always return unicode.
1688 if sys.version_info[0] < 3:
1689 newurl = compat_str(newurl)
1690
1691 # Be conciliant with URIs containing a space. This is mainly
1692 # redundant with the more complete encoding done in http_error_302(),
1693 # but it is kept for compatibility with other callers.
1694 newurl = newurl.replace(' ', '%20')
1695
1696 CONTENT_HEADERS = ("content-length", "content-type")
1697 # NB: don't use dict comprehension for python 2.6 compatibility
1698 newheaders = dict((k, v) for k, v in req.headers.items()
1699 if k.lower() not in CONTENT_HEADERS)
1700 return compat_urllib_request.Request(
1701 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1702 unverifiable=True)
1703
1704
1705 def extract_timezone(date_str):
1706 m = re.search(
1707 r'''(?x)
1708 ^.{8,}? # >=8 char non-TZ prefix, if present
1709 (?P<tz>Z| # just the UTC Z, or
1710 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1711 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1712 [ ]? # optional space
1713 (?P<sign>\+|-) # +/-
1714 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1715 $)
1716 ''', date_str)
1717 if not m:
1718 timezone = datetime.timedelta()
1719 else:
1720 date_str = date_str[:-len(m.group('tz'))]
1721 if not m.group('sign'):
1722 timezone = datetime.timedelta()
1723 else:
1724 sign = 1 if m.group('sign') == '+' else -1
1725 timezone = datetime.timedelta(
1726 hours=sign * int(m.group('hours')),
1727 minutes=sign * int(m.group('minutes')))
1728 return timezone, date_str
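# Illustrative example - the offset is returned as a timedelta and stripped
# from the string:
#   >>> extract_timezone('2021-01-01T12:00:00+0100')
#   (datetime.timedelta(seconds=3600), '2021-01-01T12:00:00')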
1729
1730
1731 def parse_iso8601(date_str, delimiter='T', timezone=None):
1732 """ Return a UNIX timestamp from the given date """
1733
1734 if date_str is None:
1735 return None
1736
1737 date_str = re.sub(r'\.[0-9]+', '', date_str)
1738
1739 if timezone is None:
1740 timezone, date_str = extract_timezone(date_str)
1741
1742 try:
1743 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1744 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1745 return calendar.timegm(dt.timetuple())
1746 except ValueError:
1747 pass
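# Illustrative example - the timezone offset is applied before conversion:
#   >>> parse_iso8601('2014-09-12T06:25:10+01:00')
#   1410499510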
1748
1749
1750 def date_formats(day_first=True):
1751 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1752
1753
1754 def unified_strdate(date_str, day_first=True):
1755 """Return a string with the date in the format YYYYMMDD"""
1756
1757 if date_str is None:
1758 return None
1759 upload_date = None
1760 # Replace commas
1761 date_str = date_str.replace(',', ' ')
1762 # Remove AM/PM + timezone
1763 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1764 _, date_str = extract_timezone(date_str)
1765
1766 for expression in date_formats(day_first):
1767 try:
1768 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1769 except ValueError:
1770 pass
1771 if upload_date is None:
1772 timetuple = email.utils.parsedate_tz(date_str)
1773 if timetuple:
1774 try:
1775 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1776 except ValueError:
1777 pass
1778 if upload_date is not None:
1779 return compat_str(upload_date)
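# Illustrative examples:
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('8/7/2009')  # day_first=True
#   '20090708'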
1780
1781
1782 def unified_timestamp(date_str, day_first=True):
1783 if date_str is None:
1784 return None
1785
1786 date_str = re.sub(r'[,|]', '', date_str)
1787
1788 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1789 timezone, date_str = extract_timezone(date_str)
1790
1791 # Remove AM/PM + timezone
1792 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1793
1794 # Remove unrecognized timezones from ISO 8601 alike timestamps
1795 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1796 if m:
1797 date_str = date_str[:-len(m.group('tz'))]
1798
1799 # Python only supports microseconds, so remove nanoseconds
1800 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1801 if m:
1802 date_str = m.group(1)
1803
1804 for expression in date_formats(day_first):
1805 try:
1806 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1807 return calendar.timegm(dt.timetuple())
1808 except ValueError:
1809 pass
1810 timetuple = email.utils.parsedate_tz(date_str)
1811 if timetuple:
1812 return calendar.timegm(timetuple) + pm_delta * 3600
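# Illustrative example:
#   >>> unified_timestamp('2017-03-30T17:52:41Z')
#   1490896361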
1813
1814
1815 def determine_ext(url, default_ext='unknown_video'):
1816 if url is None or '.' not in url:
1817 return default_ext
1818 guess = url.partition('?')[0].rpartition('.')[2]
1819 if re.match(r'^[A-Za-z0-9]+$', guess):
1820 return guess
1821 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1822 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1823 return guess.rstrip('/')
1824 else:
1825 return default_ext
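# Illustrative examples:
#   >>> determine_ext('http://example.com/video.mp4?download=1')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'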
1826
1827
1828 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1829 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1830
1831
1832 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1833 """
1834 Return a datetime object from a string in the format YYYYMMDD or
1835 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1836
1837 format: the date format used to parse date_str into a datetime object
1838 precision: round the time portion of a datetime object.
1839 auto|microsecond|second|minute|hour|day.
1840 auto: round to the unit provided in date_str (if applicable).
1841 """
1842 auto_precision = False
1843 if precision == 'auto':
1844 auto_precision = True
1845 precision = 'microsecond'
1846 today = datetime_round(datetime.datetime.utcnow(), precision)
1847 if date_str in ('now', 'today'):
1848 return today
1849 if date_str == 'yesterday':
1850 return today - datetime.timedelta(days=1)
1851 match = re.match(
1852 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1853 date_str)
1854 if match is not None:
1855 start_time = datetime_from_str(match.group('start'), precision, format)
1856 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1857 unit = match.group('unit')
1858 if unit == 'month' or unit == 'year':
1859 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1860 unit = 'day'
1861 else:
1862 if unit == 'week':
1863 unit = 'day'
1864 time *= 7
1865 delta = datetime.timedelta(**{unit + 's': time})
1866 new_date = start_time + delta
1867 if auto_precision:
1868 return datetime_round(new_date, unit)
1869 return new_date
1870
1871 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1872
1873
1874 def date_from_str(date_str, format='%Y%m%d'):
1875 """
1876 Return a date object from a string in the format YYYYMMDD or
1877 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1878
1879 format: the date format used to parse date_str
1880 """
1881 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1882
1883
1884 def datetime_add_months(dt, months):
1885 """Increment/Decrement a datetime object by months."""
1886 month = dt.month + months - 1
1887 year = dt.year + month // 12
1888 month = month % 12 + 1
1889 day = min(dt.day, calendar.monthrange(year, month)[1])
1890 return dt.replace(year, month, day)
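# Illustrative example - the day is clamped to the length of the target month:
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)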
1891
1892
1893 def datetime_round(dt, precision='day'):
1894 """
1895 Round a datetime object's time to a specific precision
1896 """
1897 if precision == 'microsecond':
1898 return dt
1899
1900 unit_seconds = {
1901 'day': 86400,
1902 'hour': 3600,
1903 'minute': 60,
1904 'second': 1,
1905 }
1906 roundto = lambda x, n: ((x + n / 2) // n) * n
1907 timestamp = calendar.timegm(dt.timetuple())
1908 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
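# Illustrative example - rounds to the nearest unit (half rounds up):
#   >>> datetime_round(datetime.datetime(2020, 5, 19, 14, 30), 'day')
#   datetime.datetime(2020, 5, 20, 0, 0)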
1909
1910
1911 def hyphenate_date(date_str):
1912 """
1913 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1914 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1915 if match is not None:
1916 return '-'.join(match.groups())
1917 else:
1918 return date_str
1919
1920
1921 class DateRange(object):
1922 """Represents a time interval between two dates"""
1923
1924 def __init__(self, start=None, end=None):
1925 """start and end must be strings in the format accepted by date"""
1926 if start is not None:
1927 self.start = date_from_str(start)
1928 else:
1929 self.start = datetime.datetime.min.date()
1930 if end is not None:
1931 self.end = date_from_str(end)
1932 else:
1933 self.end = datetime.datetime.max.date()
1934 if self.start > self.end:
1935 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1936
1937 @classmethod
1938 def day(cls, day):
1939 """Returns a range that only contains the given day"""
1940 return cls(day, day)
1941
1942 def __contains__(self, date):
1943 """Check if the date is in the range"""
1944 if not isinstance(date, datetime.date):
1945 date = date_from_str(date)
1946 return self.start <= date <= self.end
1947
1948 def __str__(self):
1949 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
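# Illustrative usage:
#   >>> '20210515' in DateRange('20210101', '20211231')
#   True
#   >>> '20210515' in DateRange.day('20210514')
#   False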
1950
1951
1952 def platform_name():
1953 """ Returns the platform name as a compat_str """
1954 res = platform.platform()
1955 if isinstance(res, bytes):
1956 res = res.decode(preferredencoding())
1957
1958 assert isinstance(res, compat_str)
1959 return res
1960
1961
1962 def get_windows_version():
1963 ''' Get Windows version. None if it's not running on Windows '''
1964 if compat_os_name == 'nt':
1965 return version_tuple(platform.win32_ver()[1])
1966 else:
1967 return None
1968
1969
1970 def _windows_write_string(s, out):
1971 """ Returns True if the string was written using special methods,
1972 False if it has yet to be written out."""
1973 # Adapted from http://stackoverflow.com/a/3259271/35070
1974
1975 import ctypes.wintypes
1976
1977 WIN_OUTPUT_IDS = {
1978 1: -11,
1979 2: -12,
1980 }
1981
1982 try:
1983 fileno = out.fileno()
1984 except AttributeError:
1985 # If the output stream doesn't have a fileno, it's virtual
1986 return False
1987 except io.UnsupportedOperation:
1988 # Some strange Windows pseudo files?
1989 return False
1990 if fileno not in WIN_OUTPUT_IDS:
1991 return False
1992
1993 GetStdHandle = compat_ctypes_WINFUNCTYPE(
1994 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1995 ('GetStdHandle', ctypes.windll.kernel32))
1996 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1997
1998 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
1999 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2000 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2001 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2002 written = ctypes.wintypes.DWORD(0)
2003
2004 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2005 FILE_TYPE_CHAR = 0x0002
2006 FILE_TYPE_REMOTE = 0x8000
2007 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2008 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2009 ctypes.POINTER(ctypes.wintypes.DWORD))(
2010 ('GetConsoleMode', ctypes.windll.kernel32))
2011 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2012
2013 def not_a_console(handle):
2014 if handle == INVALID_HANDLE_VALUE or handle is None:
2015 return True
2016 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2017 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2018
2019 if not_a_console(h):
2020 return False
2021
2022 def next_nonbmp_pos(s):
2023 try:
2024 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2025 except StopIteration:
2026 return len(s)
2027
2028 while s:
2029 count = min(next_nonbmp_pos(s), 1024)
2030
2031 ret = WriteConsoleW(
2032 h, s, count if count else 2, ctypes.byref(written), None)
2033 if ret == 0:
2034 raise OSError('Failed to write string')
2035 if not count: # We just wrote a non-BMP character
2036 assert written.value == 2
2037 s = s[1:]
2038 else:
2039 assert written.value > 0
2040 s = s[written.value:]
2041 return True
2042
2043
2044 def write_string(s, out=None, encoding=None):
2045 if out is None:
2046 out = sys.stderr
2047 assert type(s) == compat_str
2048
2049 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2050 if _windows_write_string(s, out):
2051 return
2052
2053 if ('b' in getattr(out, 'mode', '')
2054 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2055 byt = s.encode(encoding or preferredencoding(), 'ignore')
2056 out.write(byt)
2057 elif hasattr(out, 'buffer'):
2058 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2059 byt = s.encode(enc, 'ignore')
2060 out.buffer.write(byt)
2061 else:
2062 out.write(s)
2063 out.flush()
2064
2065
2066 def bytes_to_intlist(bs):
2067 if not bs:
2068 return []
2069 if isinstance(bs[0], int): # Python 3
2070 return list(bs)
2071 else:
2072 return [ord(c) for c in bs]
2073
2074
2075 def intlist_to_bytes(xs):
2076 if not xs:
2077 return b''
2078 return compat_struct_pack('%dB' % len(xs), *xs)
2079
2080
2081 # Cross-platform file locking
2082 if sys.platform == 'win32':
2083 import ctypes.wintypes
2084 import msvcrt
2085
2086 class OVERLAPPED(ctypes.Structure):
2087 _fields_ = [
2088 ('Internal', ctypes.wintypes.LPVOID),
2089 ('InternalHigh', ctypes.wintypes.LPVOID),
2090 ('Offset', ctypes.wintypes.DWORD),
2091 ('OffsetHigh', ctypes.wintypes.DWORD),
2092 ('hEvent', ctypes.wintypes.HANDLE),
2093 ]
2094
2095 kernel32 = ctypes.windll.kernel32
2096 LockFileEx = kernel32.LockFileEx
2097 LockFileEx.argtypes = [
2098 ctypes.wintypes.HANDLE, # hFile
2099 ctypes.wintypes.DWORD, # dwFlags
2100 ctypes.wintypes.DWORD, # dwReserved
2101 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2102 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2103 ctypes.POINTER(OVERLAPPED) # Overlapped
2104 ]
2105 LockFileEx.restype = ctypes.wintypes.BOOL
2106 UnlockFileEx = kernel32.UnlockFileEx
2107 UnlockFileEx.argtypes = [
2108 ctypes.wintypes.HANDLE, # hFile
2109 ctypes.wintypes.DWORD, # dwReserved
2110 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2111 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2112 ctypes.POINTER(OVERLAPPED) # Overlapped
2113 ]
2114 UnlockFileEx.restype = ctypes.wintypes.BOOL
2115 whole_low = 0xffffffff
2116 whole_high = 0x7fffffff
2117
2118 def _lock_file(f, exclusive, block):  # TODO: "block" is currently unused on win32
2119 overlapped = OVERLAPPED()
2120 overlapped.Offset = 0
2121 overlapped.OffsetHigh = 0
2122 overlapped.hEvent = 0
2123 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2124 handle = msvcrt.get_osfhandle(f.fileno())
2125 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2126 whole_low, whole_high, f._lock_file_overlapped_p):
2127 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2128
2129 def _unlock_file(f):
2130 assert f._lock_file_overlapped_p
2131 handle = msvcrt.get_osfhandle(f.fileno())
2132 if not UnlockFileEx(handle, 0,
2133 whole_low, whole_high, f._lock_file_overlapped_p):
2134 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2135
2136 else:
2137 # Some platforms, such as Jython, are missing fcntl
2138 try:
2139 import fcntl
2140
2141 def _lock_file(f, exclusive, block):
2142 fcntl.flock(f,
2143 fcntl.LOCK_SH if not exclusive
2144 else fcntl.LOCK_EX if block
2145 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2146
2147 def _unlock_file(f):
2148 fcntl.flock(f, fcntl.LOCK_UN)
2149
2150 except ImportError:
2151 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2152
2153 def _lock_file(f, exclusive, block):
2154 raise IOError(UNSUPPORTED_MSG)
2155
2156 def _unlock_file(f):
2157 raise IOError(UNSUPPORTED_MSG)
2158
2159
2160 class locked_file(object):
2161 def __init__(self, filename, mode, block=True, encoding=None):
2162 assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2163 self.f = io.open(filename, mode, encoding=encoding)
2164 self.mode = mode
2165 self.block = block
2166
2167 def __enter__(self):
2168 exclusive = 'r' not in self.mode
2169 try:
2170 _lock_file(self.f, exclusive, self.block)
2171 except IOError:
2172 self.f.close()
2173 raise
2174 return self
2175
2176 def __exit__(self, etype, value, traceback):
2177 try:
2178 _unlock_file(self.f)
2179 finally:
2180 self.f.close()
2181
2182 def __iter__(self):
2183 return iter(self.f)
2184
2185 def write(self, *args):
2186 return self.f.write(*args)
2187
2188 def read(self, *args):
2189 return self.f.read(*args)
2190
2191 def flush(self):
2192 self.f.flush()
2193
2194 def open(self):
2195 return self.__enter__()
2196
2197 def close(self, *args):
2198 self.__exit__(None, None, None)
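# Typical usage (a sketch; 'state.txt' is just a placeholder name): the lock
# is acquired on __enter__ and released when the file is closed:
#   with locked_file('state.txt', 'w') as f:
#       f.write('...')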
2199
2200
2201 def get_filesystem_encoding():
2202 encoding = sys.getfilesystemencoding()
2203 return encoding if encoding is not None else 'utf-8'
2204
2205
2206 def shell_quote(args):
2207 quoted_args = []
2208 encoding = get_filesystem_encoding()
2209 for a in args:
2210 if isinstance(a, bytes):
2211 # We may get a filename encoded with 'encodeFilename'
2212 a = a.decode(encoding)
2213 quoted_args.append(compat_shlex_quote(a))
2214 return ' '.join(quoted_args)
2215
2216
2217 def smuggle_url(url, data):
2218 """ Pass additional data in a URL for internal use. """
2219
2220 url, idata = unsmuggle_url(url, {})
2221 data.update(idata)
2222 sdata = compat_urllib_parse_urlencode(
2223 {'__youtubedl_smuggle': json.dumps(data)})
2224 return url + '#' + sdata
2225
2226
2227 def unsmuggle_url(smug_url, default=None):
2228 if '#__youtubedl_smuggle' not in smug_url:
2229 return smug_url, default
2230 url, _, sdata = smug_url.rpartition('#')
2231 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2232 data = json.loads(jsond)
2233 return url, data
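# Illustrative round-trip:
#   >>> url = smuggle_url('http://example.com/video', {'referer': 'http://example.com'})
#   >>> unsmuggle_url(url)
#   ('http://example.com/video', {'referer': 'http://example.com'})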
2234
2235
2236 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2237 """ Formats numbers with decimal sufixes like K, M, etc """
2238 num, factor = float_or_none(num), float(factor)
2239 if num is None:
2240 return None
2241 exponent = 0 if num == 0 else int(math.log(num, factor))
2242 suffix = ['', *'kMGTPEZY'][exponent]
2243 if factor == 1024:
2244 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2245 converted = num / (factor ** exponent)
2246 return fmt % (converted, suffix)
2247
2248
2249 def format_bytes(bytes):
2250 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
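# Illustrative examples for the two helpers above:
#   >>> format_decimal_suffix(123456, '%.2f%s')
#   '123.46k'
#   >>> format_bytes(1024 ** 2)
#   '1.00MiB'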
2251
2252
2253 def lookup_unit_table(unit_table, s):
2254 units_re = '|'.join(re.escape(u) for u in unit_table)
2255 m = re.match(
2256 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2257 if not m:
2258 return None
2259 num_str = m.group('num').replace(',', '.')
2260 mult = unit_table[m.group('unit')]
2261 return int(float(num_str) * mult)
2262
2263
2264 def parse_filesize(s):
2265 if s is None:
2266 return None
2267
2268 # The lower-case forms are of course incorrect and unofficial,
2269 # but we support those too
2270 _UNIT_TABLE = {
2271 'B': 1,
2272 'b': 1,
2273 'bytes': 1,
2274 'KiB': 1024,
2275 'KB': 1000,
2276 'kB': 1024,
2277 'Kb': 1000,
2278 'kb': 1000,
2279 'kilobytes': 1000,
2280 'kibibytes': 1024,
2281 'MiB': 1024 ** 2,
2282 'MB': 1000 ** 2,
2283 'mB': 1024 ** 2,
2284 'Mb': 1000 ** 2,
2285 'mb': 1000 ** 2,
2286 'megabytes': 1000 ** 2,
2287 'mebibytes': 1024 ** 2,
2288 'GiB': 1024 ** 3,
2289 'GB': 1000 ** 3,
2290 'gB': 1024 ** 3,
2291 'Gb': 1000 ** 3,
2292 'gb': 1000 ** 3,
2293 'gigabytes': 1000 ** 3,
2294 'gibibytes': 1024 ** 3,
2295 'TiB': 1024 ** 4,
2296 'TB': 1000 ** 4,
2297 'tB': 1024 ** 4,
2298 'Tb': 1000 ** 4,
2299 'tb': 1000 ** 4,
2300 'terabytes': 1000 ** 4,
2301 'tebibytes': 1024 ** 4,
2302 'PiB': 1024 ** 5,
2303 'PB': 1000 ** 5,
2304 'pB': 1024 ** 5,
2305 'Pb': 1000 ** 5,
2306 'pb': 1000 ** 5,
2307 'petabytes': 1000 ** 5,
2308 'pebibytes': 1024 ** 5,
2309 'EiB': 1024 ** 6,
2310 'EB': 1000 ** 6,
2311 'eB': 1024 ** 6,
2312 'Eb': 1000 ** 6,
2313 'eb': 1000 ** 6,
2314 'exabytes': 1000 ** 6,
2315 'exbibytes': 1024 ** 6,
2316 'ZiB': 1024 ** 7,
2317 'ZB': 1000 ** 7,
2318 'zB': 1024 ** 7,
2319 'Zb': 1000 ** 7,
2320 'zb': 1000 ** 7,
2321 'zettabytes': 1000 ** 7,
2322 'zebibytes': 1024 ** 7,
2323 'YiB': 1024 ** 8,
2324 'YB': 1000 ** 8,
2325 'yB': 1024 ** 8,
2326 'Yb': 1000 ** 8,
2327 'yb': 1000 ** 8,
2328 'yottabytes': 1000 ** 8,
2329 'yobibytes': 1024 ** 8,
2330 }
2331
2332 return lookup_unit_table(_UNIT_TABLE, s)
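# Illustrative examples - binary and decimal units are distinguished:
#   >>> parse_filesize('1.2MiB')
#   1258291
#   >>> parse_filesize('5 GB')
#   5000000000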
2333
2334
2335 def parse_count(s):
2336 if s is None:
2337 return None
2338
2339 s = re.sub(r'^[^\d]+\s', '', s).strip()
2340
2341 if re.match(r'^[\d,.]+$', s):
2342 return str_to_int(s)
2343
2344 _UNIT_TABLE = {
2345 'k': 1000,
2346 'K': 1000,
2347 'm': 1000 ** 2,
2348 'M': 1000 ** 2,
2349 'kk': 1000 ** 2,
2350 'KK': 1000 ** 2,
2351 'b': 1000 ** 3,
2352 'B': 1000 ** 3,
2353 }
2354
2355 ret = lookup_unit_table(_UNIT_TABLE, s)
2356 if ret is not None:
2357 return ret
2358
2359 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2360 if mobj:
2361 return str_to_int(mobj.group(1))
2362
2363
2364 def parse_resolution(s):
2365 if s is None:
2366 return {}
2367
2368 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2369 if mobj:
2370 return {
2371 'width': int(mobj.group('w')),
2372 'height': int(mobj.group('h')),
2373 }
2374
2375 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2376 if mobj:
2377 return {'height': int(mobj.group(1))}
2378
2379 mobj = re.search(r'\b([48])[kK]\b', s)
2380 if mobj:
2381 return {'height': int(mobj.group(1)) * 540}
2382
2383 return {}
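# Illustrative examples:
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}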
2384
2385
2386 def parse_bitrate(s):
2387 if not isinstance(s, compat_str):
2388 return
2389 mobj = re.search(r'\b(\d+)\s*kbps', s)
2390 if mobj:
2391 return int(mobj.group(1))
2392
2393
2394 def month_by_name(name, lang='en'):
2395 """ Return the number of a month by (locale-independently) English name """
2396
2397 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2398
2399 try:
2400 return month_names.index(name) + 1
2401 except ValueError:
2402 return None
2403
2404
2405 def month_by_abbreviation(abbrev):
2406 """ Return the number of a month by (locale-independently) English
2407 abbreviations """
2408
2409 try:
2410 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2411 except ValueError:
2412 return None
2413
2414
2415 def fix_xml_ampersands(xml_str):
2416 """Replace all the '&' by '&amp;' in XML"""
2417 return re.sub(
2418 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2419 '&amp;',
2420 xml_str)
2421
2422
2423 def setproctitle(title):
2424 assert isinstance(title, compat_str)
2425
2426 # ctypes in Jython is not complete
2427 # http://bugs.jython.org/issue2148
2428 if sys.platform.startswith('java'):
2429 return
2430
2431 try:
2432 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2433 except OSError:
2434 return
2435 except TypeError:
2436 # LoadLibrary in Windows Python 2.7.13 only expects
2437 # a bytestring, but since unicode_literals turns
2438 # every string into a unicode string, it fails.
2439 return
2440 title_bytes = title.encode('utf-8')
2441 buf = ctypes.create_string_buffer(len(title_bytes))
2442 buf.value = title_bytes
2443 try:
2444 libc.prctl(15, buf, 0, 0, 0)
2445 except AttributeError:
2446 return # Strange libc, just skip this
2447
2448
2449 def remove_start(s, start):
2450 return s[len(start):] if s is not None and s.startswith(start) else s
2451
2452
2453 def remove_end(s, end):
2454 return s[:-len(end)] if s is not None and s.endswith(end) else s
2455
2456
2457 def remove_quotes(s):
2458 if s is None or len(s) < 2:
2459 return s
2460 for quote in ('"', "'", ):
2461 if s[0] == quote and s[-1] == quote:
2462 return s[1:-1]
2463 return s
2464
2465
2466 def get_domain(url):
2467 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2468 return domain.group('domain') if domain else None
2469
2470
2471 def url_basename(url):
2472 path = compat_urlparse.urlparse(url).path
2473 return path.strip('/').split('/')[-1]
2474
2475
2476 def base_url(url):
2477 return re.match(r'https?://[^?#&]+/', url).group()
2478
2479
2480 def urljoin(base, path):
2481 if isinstance(path, bytes):
2482 path = path.decode('utf-8')
2483 if not isinstance(path, compat_str) or not path:
2484 return None
2485 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2486 return path
2487 if isinstance(base, bytes):
2488 base = base.decode('utf-8')
2489 if not isinstance(base, compat_str) or not re.match(
2490 r'^(?:https?:)?//', base):
2491 return None
2492 return compat_urlparse.urljoin(base, path)
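# Illustrative examples - protocol-relative paths are returned unchanged:
#   >>> urljoin('https://example.com/a/', 'b.mp4')
#   'https://example.com/a/b.mp4'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/b.mp4')
#   '//cdn.example.com/b.mp4'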
2493
2494
2495 class HEADRequest(compat_urllib_request.Request):
2496 def get_method(self):
2497 return 'HEAD'
2498
2499
2500 class PUTRequest(compat_urllib_request.Request):
2501 def get_method(self):
2502 return 'PUT'
2503
2504
2505 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2506 if get_attr and v is not None:
2507 v = getattr(v, get_attr, None)
2508 try:
2509 return int(v) * invscale // scale
2510 except (ValueError, TypeError, OverflowError):
2511 return default
2512
2513
2514 def str_or_none(v, default=None):
2515 return default if v is None else compat_str(v)
2516
2517
2518 def str_to_int(int_str):
2519 """ A more relaxed version of int_or_none """
2520 if isinstance(int_str, compat_integer_types):
2521 return int_str
2522 elif isinstance(int_str, compat_str):
2523 int_str = re.sub(r'[,\.\+]', '', int_str)
2524 return int_or_none(int_str)
2525
2526
2527 def float_or_none(v, scale=1, invscale=1, default=None):
2528 if v is None:
2529 return default
2530 try:
2531 return float(v) * invscale / scale
2532 except (ValueError, TypeError):
2533 return default
2534
2535
2536 def bool_or_none(v, default=None):
2537 return v if isinstance(v, bool) else default
2538
2539
2540 def strip_or_none(v, default=None):
2541 return v.strip() if isinstance(v, compat_str) else default
2542
2543
2544 def url_or_none(url):
2545 if not url or not isinstance(url, compat_str):
2546 return None
2547 url = url.strip()
2548 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2549
2550
2551 def strftime_or_none(timestamp, date_format, default=None):
2552 datetime_object = None
2553 try:
2554 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2555 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2556 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2557 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2558 return datetime_object.strftime(date_format)
2559 except (ValueError, TypeError, AttributeError):
2560 return default
2561
2562
2563 def parse_duration(s):
2564 if not isinstance(s, compat_basestring):
2565 return None
2566 s = s.strip()
2567 if not s:
2568 return None
2569
2570 days, hours, mins, secs, ms = [None] * 5
2571 m = re.match(r'''(?x)
2572 (?P<before_secs>
2573 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2574 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2575 (?P<ms>[.:][0-9]+)?Z?$
2576 ''', s)
2577 if m:
2578 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2579 else:
2580 m = re.match(
2581 r'''(?ix)(?:P?
2582 (?:
2583 [0-9]+\s*y(?:ears?)?\s*
2584 )?
2585 (?:
2586 [0-9]+\s*m(?:onths?)?\s*
2587 )?
2588 (?:
2589 [0-9]+\s*w(?:eeks?)?\s*
2590 )?
2591 (?:
2592 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2593 )?
2594 T)?
2595 (?:
2596 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2597 )?
2598 (?:
2599 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2600 )?
2601 (?:
2602 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2603 )?Z?$''', s)
2604 if m:
2605 days, hours, mins, secs, ms = m.groups()
2606 else:
2607 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2608 if m:
2609 hours, mins = m.groups()
2610 else:
2611 return None
2612
2613 duration = 0
2614 if secs:
2615 duration += float(secs)
2616 if mins:
2617 duration += float(mins) * 60
2618 if hours:
2619 duration += float(hours) * 60 * 60
2620 if days:
2621 duration += float(days) * 24 * 60 * 60
2622 if ms:
2623 duration += float(ms.replace(':', '.'))
2624 return duration
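# Illustrative examples:
#   >>> parse_duration('1:02:03')
#   3723.0
#   >>> parse_duration('PT1H30M')  # ISO 8601
#   5400.0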
2625
2626
2627 def prepend_extension(filename, ext, expected_real_ext=None):
2628 name, real_ext = os.path.splitext(filename)
2629 return (
2630 '{0}.{1}{2}'.format(name, ext, real_ext)
2631 if not expected_real_ext or real_ext[1:] == expected_real_ext
2632 else '{0}.{1}'.format(filename, ext))
2633
2634
2635 def replace_extension(filename, ext, expected_real_ext=None):
2636 name, real_ext = os.path.splitext(filename)
2637 return '{0}.{1}'.format(
2638 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2639 ext)
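# Illustrative examples for the two helpers above:
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.mp4', 'mkv')
#   'video.mkv'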
2640
2641
2642 def check_executable(exe, args=[]):
2643 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2644 args can be a list of arguments for a short output (like -version) """
2645 try:
2646 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2647 except OSError:
2648 return False
2649 return exe
2650
2651
2652 def _get_exe_version_output(exe, args):
2653 try:
2654 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2655 # SIGTTOU if yt-dlp is run in the background.
2656 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2657 out, _ = Popen(
2658 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2659 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2660 except OSError:
2661 return False
2662 if isinstance(out, bytes): # Python 2.x
2663 out = out.decode('ascii', 'ignore')
2664 return out
2665
2666
2667 def detect_exe_version(output, version_re=None, unrecognized='present'):
2668 assert isinstance(output, compat_str)
2669 if version_re is None:
2670 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2671 m = re.search(version_re, output)
2672 if m:
2673 return m.group(1)
2674 else:
2675 return unrecognized
2676
2677
2678 def get_exe_version(exe, args=['--version'],
2679 version_re=None, unrecognized='present'):
2680 """ Returns the version of the specified executable,
2681 or False if the executable is not present """
2682 out = _get_exe_version_output(exe, args)
2683 return detect_exe_version(out, version_re, unrecognized) if out else False
2684
2685
2686 class LazyList(collections.abc.Sequence):
2687 ''' Lazy immutable list from an iterable
2688 Note that slices of a LazyList are lists and not LazyList'''
2689
2690 class IndexError(IndexError):
2691 pass
2692
2693 def __init__(self, iterable, *, reverse=False, _cache=None):
2694 self.__iterable = iter(iterable)
2695 self.__cache = [] if _cache is None else _cache
2696 self.__reversed = reverse
2697
2698 def __iter__(self):
2699 if self.__reversed:
2700 # We need to consume the entire iterable to iterate in reverse
2701 yield from self.exhaust()
2702 return
2703 yield from self.__cache
2704 for item in self.__iterable:
2705 self.__cache.append(item)
2706 yield item
2707
2708 def __exhaust(self):
2709 self.__cache.extend(self.__iterable)
2710 # Discard the emptied iterable to make it pickle-able
2711 self.__iterable = []
2712 return self.__cache
2713
2714 def exhaust(self):
2715 ''' Evaluate the entire iterable '''
2716 return self.__exhaust()[::-1 if self.__reversed else 1]
2717
2718 @staticmethod
2719 def __reverse_index(x):
2720 return None if x is None else -(x + 1)
2721
2722 def __getitem__(self, idx):
2723 if isinstance(idx, slice):
2724 if self.__reversed:
2725 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2726 start, stop, step = idx.start, idx.stop, idx.step or 1
2727 elif isinstance(idx, int):
2728 if self.__reversed:
2729 idx = self.__reverse_index(idx)
2730 start, stop, step = idx, idx, 0
2731 else:
2732 raise TypeError('indices must be integers or slices')
2733 if ((start or 0) < 0 or (stop or 0) < 0
2734 or (start is None and step < 0)
2735 or (stop is None and step > 0)):
2736 # We need to consume the entire iterable to be able to slice from the end
2737 # Obviously, never use this with infinite iterables
2738 self.__exhaust()
2739 try:
2740 return self.__cache[idx]
2741 except IndexError as e:
2742 raise self.IndexError(e) from e
2743 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2744 if n > 0:
2745 self.__cache.extend(itertools.islice(self.__iterable, n))
2746 try:
2747 return self.__cache[idx]
2748 except IndexError as e:
2749 raise self.IndexError(e) from e
2750
2751 def __bool__(self):
2752 try:
2753 self[-1] if self.__reversed else self[0]
2754 except self.IndexError:
2755 return False
2756 return True
2757
2758 def __len__(self):
2759 self.__exhaust()
2760 return len(self.__cache)
2761
2762 def __reversed__(self):
2763 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2764
2765 def __copy__(self):
2766 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2767
2768 def __repr__(self):
2769 # repr and str should mimic a list. So we exhaust the iterable
2770 return repr(self.exhaust())
2771
2772 def __str__(self):
2773 return repr(self.exhaust())
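# Illustrative usage - items are pulled from the iterable only on demand
# and cached, so even infinite iterables can be indexed:
#   >>> l = LazyList(itertools.count())
#   >>> l[5]
#   5
#   >>> l[:3]
#   [0, 1, 2]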
2774
2775
2776 class PagedList:
2777
2778 class IndexError(IndexError):
2779 pass
2780
2781 def __len__(self):
2782 # This is only useful for tests
2783 return len(self.getslice())
2784
2785 def __init__(self, pagefunc, pagesize, use_cache=True):
2786 self._pagefunc = pagefunc
2787 self._pagesize = pagesize
2788 self._use_cache = use_cache
2789 self._cache = {}
2790
2791 def getpage(self, pagenum):
2792 page_results = self._cache.get(pagenum)
2793 if page_results is None:
2794 page_results = list(self._pagefunc(pagenum))
2795 if self._use_cache:
2796 self._cache[pagenum] = page_results
2797 return page_results
2798
2799 def getslice(self, start=0, end=None):
2800 return list(self._getslice(start, end))
2801
2802 def _getslice(self, start, end):
2803 raise NotImplementedError('This method must be implemented by subclasses')
2804
2805 def __getitem__(self, idx):
2806 # NOTE: cache must be enabled if this is used
2807 if not isinstance(idx, int) or idx < 0:
2808 raise TypeError('indices must be non-negative integers')
2809 entries = self.getslice(idx, idx + 1)
2810 if not entries:
2811 raise self.IndexError()
2812 return entries[0]
2813
2814
2815 class OnDemandPagedList(PagedList):
2816 def _getslice(self, start, end):
2817 for pagenum in itertools.count(start // self._pagesize):
2818 firstid = pagenum * self._pagesize
2819 nextfirstid = pagenum * self._pagesize + self._pagesize
2820 if start >= nextfirstid:
2821 continue
2822
2823 startv = (
2824 start % self._pagesize
2825 if firstid <= start < nextfirstid
2826 else 0)
2827 endv = (
2828 ((end - 1) % self._pagesize) + 1
2829 if (end is not None and firstid <= end <= nextfirstid)
2830 else None)
2831
2832 page_results = self.getpage(pagenum)
2833 if startv != 0 or endv is not None:
2834 page_results = page_results[startv:endv]
2835 yield from page_results
2836
2837 # A little optimization - if the current page is not "full", i.e. does
2838 # not contain page_size videos, then we can assume that this page
2839 # is the last one - there are no more ids on further pages -
2840 # so there is no need to query again.
2841 if len(page_results) + startv < self._pagesize:
2842 break
2843
2844 # If we got the whole page, but the next page is not interesting,
2845 # break out early as well
2846 if end == nextfirstid:
2847 break
2848
2849
2850 class InAdvancePagedList(PagedList):
2851 def __init__(self, pagefunc, pagecount, pagesize):
2852 self._pagecount = pagecount
2853 PagedList.__init__(self, pagefunc, pagesize, True)
2854
2855 def _getslice(self, start, end):
2856 start_page = start // self._pagesize
2857 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2858 skip_elems = start - start_page * self._pagesize
2859 only_more = None if end is None else end - start
2860 for pagenum in range(start_page, end_page):
2861 page_results = self.getpage(pagenum)
2862 if skip_elems:
2863 page_results = page_results[skip_elems:]
2864 skip_elems = None
2865 if only_more is not None:
2866 if len(page_results) < only_more:
2867 only_more -= len(page_results)
2868 else:
2869 yield from page_results[:only_more]
2870 break
2871 yield from page_results
2872
2873
2874 def uppercase_escape(s):
2875 unicode_escape = codecs.getdecoder('unicode_escape')
2876 return re.sub(
2877 r'\\U[0-9a-fA-F]{8}',
2878 lambda m: unicode_escape(m.group(0))[0],
2879 s)
2880
2881
2882 def lowercase_escape(s):
2883 unicode_escape = codecs.getdecoder('unicode_escape')
2884 return re.sub(
2885 r'\\u[0-9a-fA-F]{4}',
2886 lambda m: unicode_escape(m.group(0))[0],
2887 s)
2888
2889
2890 def escape_rfc3986(s):
2891 """Escape non-ASCII characters as suggested by RFC 3986"""
2892 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2893 s = s.encode('utf-8')
2894 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2895
2896
2897 def escape_url(url):
2898 """Escape URL as suggested by RFC 3986"""
2899 url_parsed = compat_urllib_parse_urlparse(url)
2900 return url_parsed._replace(
2901 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2902 path=escape_rfc3986(url_parsed.path),
2903 params=escape_rfc3986(url_parsed.params),
2904 query=escape_rfc3986(url_parsed.query),
2905 fragment=escape_rfc3986(url_parsed.fragment)
2906 ).geturl()
2907
2908
2909 def parse_qs(url):
2910 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2911
2912
2913 def read_batch_urls(batch_fd):
2914 def fixup(url):
2915 if not isinstance(url, compat_str):
2916 url = url.decode('utf-8', 'replace')
2917 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2918 for bom in BOM_UTF8:
2919 if url.startswith(bom):
2920 url = url[len(bom):]
2921 url = url.lstrip()
2922 if not url or url.startswith(('#', ';', ']')):
2923 return False
2924 # "#" cannot be stripped out since it is part of the URI
2925 # However, it can be safely stripped out if following a whitespace
2926 return re.split(r'\s#', url, 1)[0].rstrip()
2927
2928 with contextlib.closing(batch_fd) as fd:
2929 return [url for url in map(fixup, fd) if url]
2930
2931
2932 def urlencode_postdata(*args, **kargs):
2933 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2934
2935
2936 def update_url_query(url, query):
2937 if not query:
2938 return url
2939 parsed_url = compat_urlparse.urlparse(url)
2940 qs = compat_parse_qs(parsed_url.query)
2941 qs.update(query)
2942 return compat_urlparse.urlunparse(parsed_url._replace(
2943 query=compat_urllib_parse_urlencode(qs, True)))
2944
2945
2946 def update_Request(req, url=None, data=None, headers={}, query={}):
2947 req_headers = req.headers.copy()
2948 req_headers.update(headers)
2949 req_data = data or req.data
2950 req_url = update_url_query(url or req.get_full_url(), query)
2951 req_get_method = req.get_method()
2952 if req_get_method == 'HEAD':
2953 req_type = HEADRequest
2954 elif req_get_method == 'PUT':
2955 req_type = PUTRequest
2956 else:
2957 req_type = compat_urllib_request.Request
2958 new_req = req_type(
2959 req_url, data=req_data, headers=req_headers,
2960 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2961 if hasattr(req, 'timeout'):
2962 new_req.timeout = req.timeout
2963 return new_req
2964
2965
2966 def _multipart_encode_impl(data, boundary):
2967 content_type = 'multipart/form-data; boundary=%s' % boundary
2968
2969 out = b''
2970 for k, v in data.items():
2971 out += b'--' + boundary.encode('ascii') + b'\r\n'
2972 if isinstance(k, compat_str):
2973 k = k.encode('utf-8')
2974 if isinstance(v, compat_str):
2975 v = v.encode('utf-8')
2976 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2977 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2978 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2979 if boundary.encode('ascii') in content:
2980 raise ValueError('Boundary overlaps with data')
2981 out += content
2982
2983 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2984
2985 return out, content_type
2986
2987
2988 def multipart_encode(data, boundary=None):
2989 '''
2990 Encode a dict to RFC 7578-compliant form-data
2991
2992 data:
2993 A dict where keys and values can be either Unicode or bytes-like
2994 objects.
2995 boundary:
2996 If specified a Unicode object, it's used as the boundary. Otherwise
2997 a random boundary is generated.
2998
2999 Reference: https://tools.ietf.org/html/rfc7578
3000 '''
3001 has_specified_boundary = boundary is not None
3002
3003 while True:
3004 if boundary is None:
3005 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3006
3007 try:
3008 out, content_type = _multipart_encode_impl(data, boundary)
3009 break
3010 except ValueError:
3011 if has_specified_boundary:
3012 raise
3013 boundary = None
3014
3015 return out, content_type
3016
3017
3018 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3019 if isinstance(key_or_keys, (list, tuple)):
3020 for key in key_or_keys:
3021 if key not in d or d[key] is None or skip_false_values and not d[key]:
3022 continue
3023 return d[key]
3024 return default
3025 return d.get(key_or_keys, default)
3026
3027
3028 def try_get(src, getter, expected_type=None):
3029 for get in variadic(getter):
3030 try:
3031 v = get(src)
3032 except (AttributeError, KeyError, TypeError, IndexError):
3033 pass
3034 else:
3035 if expected_type is None or isinstance(v, expected_type):
3036 return v
3037
3038
3039 def merge_dicts(*dicts):
3040 merged = {}
3041 for a_dict in dicts:
3042 for k, v in a_dict.items():
3043 if v is None:
3044 continue
3045 if (k not in merged
3046 or (isinstance(v, compat_str) and v
3047 and isinstance(merged[k], compat_str)
3048 and not merged[k])):
3049 merged[k] = v
3050 return merged
3051
3052
3053 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3054 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3055
3056
3057 US_RATINGS = {
3058 'G': 0,
3059 'PG': 10,
3060 'PG-13': 13,
3061 'R': 16,
3062 'NC': 18,
3063 }
3064
3065
3066 TV_PARENTAL_GUIDELINES = {
3067 'TV-Y': 0,
3068 'TV-Y7': 7,
3069 'TV-G': 0,
3070 'TV-PG': 0,
3071 'TV-14': 14,
3072 'TV-MA': 17,
3073 }
3074
3075
3076 def parse_age_limit(s):
3077 if type(s) == int:
3078 return s if 0 <= s <= 21 else None
3079 if not isinstance(s, compat_basestring):
3080 return None
3081 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3082 if m:
3083 return int(m.group('age'))
3084 s = s.upper()
3085 if s in US_RATINGS:
3086 return US_RATINGS[s]
3087 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3088 if m:
3089 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3090 return None
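# Illustrative examples:
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18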
3091
3092
3093 def strip_jsonp(code):
3094 return re.sub(
3095 r'''(?sx)^
3096 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3097 (?:\s*&&\s*(?P=func_name))?
3098 \s*\(\s*(?P<callback_data>.*)\);?
3099 \s*?(?://[^\n]*)*$''',
3100 r'\g<callback_data>', code)
3101
3102
3103 def js_to_json(code, vars={}):
3104 # vars is a dict of var, val pairs to substitute
3105 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3106 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3107 INTEGER_TABLE = (
3108 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3109 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3110 )
3111
3112 def fix_kv(m):
3113 v = m.group(0)
3114 if v in ('true', 'false', 'null'):
3115 return v
3116 elif v in ('undefined', 'void 0'):
3117 return 'null'
3118 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3119 return ""
3120
3121 if v[0] in ("'", '"'):
3122 v = re.sub(r'(?s)\\.|"', lambda m: {
3123 '"': '\\"',
3124 "\\'": "'",
3125 '\\\n': '',
3126 '\\x': '\\u00',
3127 }.get(m.group(0), m.group(0)), v[1:-1])
3128 else:
3129 for regex, base in INTEGER_TABLE:
3130 im = re.match(regex, v)
3131 if im:
3132 i = int(im.group(1), base)
3133 return '"%d":' % i if v.endswith(':') else '%d' % i
3134
3135 if v in vars:
3136 return vars[v]
3137
3138 return '"%s"' % v
3139
3140 return re.sub(r'''(?sx)
3141 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3142 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3143 {comment}|,(?={skip}[\]}}])|
3144 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3145 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3146 [0-9]+(?={skip}:)|
3147 !+
3148 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
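# Illustrative example - unquoted keys, single quotes and hex literals
# are converted to valid JSON:
#   >>> js_to_json("{abc: 'def', ghi: 0x1F}")
#   '{"abc": "def", "ghi": 31}'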
3149
3150
3151 def qualities(quality_ids):
3152 """ Get a numeric quality value out of a list of possible values """
3153 def q(qid):
3154 try:
3155 return quality_ids.index(qid)
3156 except ValueError:
3157 return -1
3158 return q
3159
3160
3161 POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3162
3163
3164 DEFAULT_OUTTMPL = {
3165 'default': '%(title)s [%(id)s].%(ext)s',
3166 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3167 }
3168 OUTTMPL_TYPES = {
3169 'chapter': None,
3170 'subtitle': None,
3171 'thumbnail': None,
3172 'description': 'description',
3173 'annotation': 'annotations.xml',
3174 'infojson': 'info.json',
3175 'link': None,
3176 'pl_video': None,
3177 'pl_thumbnail': None,
3178 'pl_description': 'description',
3179 'pl_infojson': 'info.json',
3180 }
3181
3182 # As of [1] format syntax is:
3183 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3184 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3185 STR_FORMAT_RE_TMPL = r'''(?x)
3186 (?<!%)(?P<prefix>(?:%%)*)
3187 %
3188 (?P<has_key>\((?P<key>{0})\))?
3189 (?P<format>
3190 (?P<conversion>[#0\-+ ]+)?
3191 (?P<min_width>\d+)?
3192 (?P<precision>\.\d+)?
3193 (?P<len_mod>[hlL])? # unused in python
3194 {1} # conversion type
3195 )
3196 '''
3197
3198
3199 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3200
3201
3202 def limit_length(s, length):
3203 """ Add ellipses to overly long strings """
3204 if s is None:
3205 return None
3206 ELLIPSES = '...'
3207 if len(s) > length:
3208 return s[:length - len(ELLIPSES)] + ELLIPSES
3209 return s
3210
3211
3212 def version_tuple(v):
3213 return tuple(int(e) for e in re.split(r'[-.]', v))
3214
3215
3216 def is_outdated_version(version, limit, assume_new=True):
3217 if not version:
3218 return not assume_new
3219 try:
3220 return version_tuple(version) < version_tuple(limit)
3221 except ValueError:
3222 return not assume_new
3223
3224
3225 def ytdl_is_updateable():
3226 """ Returns if yt-dlp can be updated with -U """
3227
3228 from .update import is_non_updateable
3229
3230 return not is_non_updateable()
3231
3232
3233 def args_to_str(args):
3234 # Get a short string representation for a subprocess command
3235 return ' '.join(compat_shlex_quote(a) for a in args)
3236
3237
3238 def error_to_compat_str(err):
3239 err_str = str(err)
3240 # On python 2 error byte string must be decoded with proper
3241 # encoding rather than ascii
3242 if sys.version_info[0] < 3:
3243 err_str = err_str.decode(preferredencoding())
3244 return err_str
3245
3246
3247 def mimetype2ext(mt):
3248 if mt is None:
3249 return None
3250
3251 mt, _, params = mt.partition(';')
3252 mt = mt.strip()
3253
3254 FULL_MAP = {
3255 'audio/mp4': 'm4a',
3256 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
3257 # as it's the most popular one
3258 'audio/mpeg': 'mp3',
3259 'audio/x-wav': 'wav',
3260 'audio/wav': 'wav',
3261 'audio/wave': 'wav',
3262 }
3263
3264 ext = FULL_MAP.get(mt)
3265 if ext is not None:
3266 return ext
3267
3268 SUBTYPE_MAP = {
3269 '3gpp': '3gp',
3270 'smptett+xml': 'tt',
3271 'ttaf+xml': 'dfxp',
3272 'ttml+xml': 'ttml',
3273 'x-flv': 'flv',
3274 'x-mp4-fragmented': 'mp4',
3275 'x-ms-sami': 'sami',
3276 'x-ms-wmv': 'wmv',
3277 'mpegurl': 'm3u8',
3278 'x-mpegurl': 'm3u8',
3279 'vnd.apple.mpegurl': 'm3u8',
3280 'dash+xml': 'mpd',
3281 'f4m+xml': 'f4m',
3282 'hds+xml': 'f4m',
3283 'vnd.ms-sstr+xml': 'ism',
3284 'quicktime': 'mov',
3285 'mp2t': 'ts',
3286 'x-wav': 'wav',
3287 'filmstrip+json': 'fs',
3288 'svg+xml': 'svg',
3289 }
3290
3291 _, _, subtype = mt.rpartition('/')
3292 ext = SUBTYPE_MAP.get(subtype.lower())
3293 if ext is not None:
3294 return ext
3295
3296 SUFFIX_MAP = {
3297 'json': 'json',
3298 'xml': 'xml',
3299 'zip': 'zip',
3300 'gzip': 'gz',
3301 }
3302
3303 _, _, suffix = subtype.partition('+')
3304 ext = SUFFIX_MAP.get(suffix)
3305 if ext is not None:
3306 return ext
3307
3308 return subtype.replace('+', '.')
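# Illustrative examples:
#   >>> mimetype2ext('audio/mp4')
#   'm4a'
#   >>> mimetype2ext('application/x-mpegURL')
#   'm3u8'
#   >>> mimetype2ext('text/vtt; charset=utf-8')
#   'vtt'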
3309
3310
3311 def ext2mimetype(ext_or_url):
3312 if not ext_or_url:
3313 return None
3314 if '.' not in ext_or_url:
3315 ext_or_url = f'file.{ext_or_url}'
3316 return mimetypes.guess_type(ext_or_url)[0]
3317
3318
3319 def parse_codecs(codecs_str):
3320 # http://tools.ietf.org/html/rfc6381
3321 if not codecs_str:
3322 return {}
3323 split_codecs = list(filter(None, map(
3324 str.strip, codecs_str.strip().strip(',').split(','))))
3325 vcodec, acodec, tcodec, hdr = None, None, None, None
3326 for full_codec in split_codecs:
3327 parts = full_codec.split('.')
3328 codec = parts[0].replace('0', '')
3329 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3330 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3331 if not vcodec:
3332 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3333 if codec in ('dvh1', 'dvhe'):
3334 hdr = 'DV'
3335 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3336 hdr = 'HDR10'
3337 elif full_codec.replace('0', '').startswith('vp9.2'):
3338 hdr = 'HDR10'
3339 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3340 if not acodec:
3341 acodec = full_codec
3342 elif codec in ('stpp', 'wvtt',):
3343 if not tcodec:
3344 tcodec = full_codec
3345 else:
3346 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3347 if vcodec or acodec or tcodec:
3348 return {
3349 'vcodec': vcodec or 'none',
3350 'acodec': acodec or 'none',
3351 'dynamic_range': hdr,
3352 **({'tcodec': tcodec} if tcodec is not None else {}),
3353 }
3354 elif len(split_codecs) == 2:
3355 return {
3356 'vcodec': split_codecs[0],
3357 'acodec': split_codecs[1],
3358 }
3359 return {}
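# Illustrative example:
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}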
3360
3361
3362 def urlhandle_detect_ext(url_handle):
3363 getheader = url_handle.headers.get
3364
3365 cd = getheader('Content-Disposition')
3366 if cd:
3367 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3368 if m:
3369 e = determine_ext(m.group('filename'), default_ext=None)
3370 if e:
3371 return e
3372
3373 return mimetype2ext(getheader('Content-Type'))
3374
3375
3376 def encode_data_uri(data, mime_type):
3377 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3378
3379
3380 def age_restricted(content_limit, age_limit):
3381 """ Returns True iff the content should be blocked """
3382
3383 if age_limit is None: # No limit set
3384 return False
3385 if content_limit is None:
3386 return False # Content available for everyone
3387 return age_limit < content_limit
3388
3389
3390 def is_html(first_bytes):
3391 """ Detect whether a file contains HTML by examining its first bytes. """
3392
3393 BOMS = [
3394 (b'\xef\xbb\xbf', 'utf-8'),
3395 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3396 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3397 (b'\xff\xfe', 'utf-16-le'),
3398 (b'\xfe\xff', 'utf-16-be'),
3399 ]
3400 for bom, enc in BOMS:
3401 if first_bytes.startswith(bom):
3402 s = first_bytes[len(bom):].decode(enc, 'replace')
3403 break
3404 else:
3405 s = first_bytes.decode('utf-8', 'replace')
3406
3407 return re.match(r'^\s*<', s)
3408
3409
3410 def determine_protocol(info_dict):
3411 protocol = info_dict.get('protocol')
3412 if protocol is not None:
3413 return protocol
3414
3415 url = sanitize_url(info_dict['url'])
3416 if url.startswith('rtmp'):
3417 return 'rtmp'
3418 elif url.startswith('mms'):
3419 return 'mms'
3420 elif url.startswith('rtsp'):
3421 return 'rtsp'
3422
3423 ext = determine_ext(url)
3424 if ext == 'm3u8':
3425 return 'm3u8'
3426 elif ext == 'f4m':
3427 return 'f4m'
3428
3429 return compat_urllib_parse_urlparse(url).scheme
3430
3431
3432 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3433 """ Render a list of rows, each as a list of values.
3434 Text after a \t will be right aligned """
3435 def width(string):
3436 return len(remove_terminal_sequences(string).replace('\t', ''))
3437
3438 def get_max_lens(table):
3439 return [max(width(str(v)) for v in col) for col in zip(*table)]
3440
3441 def filter_using_list(row, filterArray):
3442 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3443
3444 max_lens = get_max_lens(data) if hide_empty else []
3445 header_row = filter_using_list(header_row, max_lens)
3446 data = [filter_using_list(row, max_lens) for row in data]
3447
3448 table = [header_row] + data
3449 max_lens = get_max_lens(table)
3450 extra_gap += 1
3451 if delim:
3452 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3453 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
3454 for row in table:
3455 for pos, text in enumerate(map(str, row)):
3456 if '\t' in text:
3457 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3458 else:
3459 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3460 ret = '\n'.join(''.join(row).rstrip() for row in table)
3461 return ret
3462
3463
3464 def _match_one(filter_part, dct, incomplete):
3465 # TODO: Generalize code with YoutubeDL._build_format_filter
3466 STRING_OPERATORS = {
3467 '*=': operator.contains,
3468 '^=': lambda attr, value: attr.startswith(value),
3469 '$=': lambda attr, value: attr.endswith(value),
3470 '~=': lambda attr, value: re.search(value, attr),
3471 }
3472 COMPARISON_OPERATORS = {
3473 **STRING_OPERATORS,
3474 '<=': operator.le, # "<=" must be defined above "<"
3475 '<': operator.lt,
3476 '>=': operator.ge,
3477 '>': operator.gt,
3478 '=': operator.eq,
3479 }
3480
3481 operator_rex = re.compile(r'''(?x)\s*
3482 (?P<key>[a-z_]+)
3483 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3484 (?:
3485 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3486 (?P<strval>.+?)
3487 )
3488 \s*$
3489 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3490 m = operator_rex.search(filter_part)
3491 if m:
3492 m = m.groupdict()
3493 unnegated_op = COMPARISON_OPERATORS[m['op']]
3494 if m['negation']:
3495 op = lambda attr, value: not unnegated_op(attr, value)
3496 else:
3497 op = unnegated_op
3498 comparison_value = m['quotedstrval'] or m['strval']
3499 if m['quote']:
3500 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3501 actual_value = dct.get(m['key'])
3502 numeric_comparison = None
3503 if isinstance(actual_value, compat_numeric_types):
3504 # If the original field is a string and the matching comparison value is
3505 # a number we should respect the origin of the original field
3506 # and process comparison value as a string (see
3507 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3508 try:
3509 numeric_comparison = int(comparison_value)
3510 except ValueError:
3511 numeric_comparison = parse_filesize(comparison_value)
3512 if numeric_comparison is None:
3513 numeric_comparison = parse_filesize(f'{comparison_value}B')
3514 if numeric_comparison is None:
3515 numeric_comparison = parse_duration(comparison_value)
3516 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3517 raise ValueError('Operator %s only supports string values!' % m['op'])
3518 if actual_value is None:
3519 return incomplete or m['none_inclusive']
3520 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3521
3522 UNARY_OPERATORS = {
3523 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3524 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3525 }
3526 operator_rex = re.compile(r'''(?x)\s*
3527 (?P<op>%s)\s*(?P<key>[a-z_]+)
3528 \s*$
3529 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3530 m = operator_rex.search(filter_part)
3531 if m:
3532 op = UNARY_OPERATORS[m.group('op')]
3533 actual_value = dct.get(m.group('key'))
3534 if incomplete and actual_value is None:
3535 return True
3536 return op(actual_value)
3537
3538 raise ValueError('Invalid filter part %r' % filter_part)
3539
3540
3541 def match_str(filter_str, dct, incomplete=False):
3542 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3543 When incomplete, all conditions passes on missing fields
3544 """
3545 return all(
3546 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3547 for filter_part in re.split(r'(?<!\\)&', filter_str))
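# Illustrative example - '&'-separated clauses must all pass:
#   >>> match_str('like_count > 100 & description', {'like_count': 190, 'description': 'text'})
#   True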
3548
3549
3550 def match_filter_func(filter_str):
3551 def _match_func(info_dict, *args, **kwargs):
3552 if match_str(filter_str, info_dict, *args, **kwargs):
3553 return None
3554 else:
3555 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3556 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3557 return _match_func
3558
3559
3560 def parse_dfxp_time_expr(time_expr):
3561 if not time_expr:
3562 return
3563
3564 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3565 if mobj:
3566 return float(mobj.group('time_offset'))
3567
3568 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3569 if mobj:
3570 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3571
3572
3573 def srt_subtitles_timecode(seconds):
3574 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3575
3576
3577 def ass_subtitles_timecode(seconds):
3578 time = timetuple_from_msec(seconds * 1000)
3579 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3580
3581
3582 def dfxp2srt(dfxp_data):
3583 '''
3584 @param dfxp_data A bytes-like object containing DFXP data
3585 @returns A unicode object containing converted SRT data
3586 '''
3587 LEGACY_NAMESPACES = (
3588 (b'http://www.w3.org/ns/ttml', [
3589 b'http://www.w3.org/2004/11/ttaf1',
3590 b'http://www.w3.org/2006/04/ttaf1',
3591 b'http://www.w3.org/2006/10/ttaf1',
3592 ]),
3593 (b'http://www.w3.org/ns/ttml#styling', [
3594 b'http://www.w3.org/ns/ttml#style',
3595 ]),
3596 )
3597
3598 SUPPORTED_STYLING = [
3599 'color',
3600 'fontFamily',
3601 'fontSize',
3602 'fontStyle',
3603 'fontWeight',
3604 'textDecoration'
3605 ]
3606
3607 _x = functools.partial(xpath_with_ns, ns_map={
3608 'xml': 'http://www.w3.org/XML/1998/namespace',
3609 'ttml': 'http://www.w3.org/ns/ttml',
3610 'tts': 'http://www.w3.org/ns/ttml#styling',
3611 })
3612
3613 styles = {}
3614 default_style = {}
3615
3616 class TTMLPElementParser(object):
3617 _out = ''
3618 _unclosed_elements = []
3619 _applied_styles = []
3620
3621 def start(self, tag, attrib):
3622 if tag in (_x('ttml:br'), 'br'):
3623 self._out += '\n'
3624 else:
3625 unclosed_elements = []
3626 style = {}
3627 element_style_id = attrib.get('style')
3628 if default_style:
3629 style.update(default_style)
3630 if element_style_id:
3631 style.update(styles.get(element_style_id, {}))
3632 for prop in SUPPORTED_STYLING:
3633 prop_val = attrib.get(_x('tts:' + prop))
3634 if prop_val:
3635 style[prop] = prop_val
3636 if style:
3637 font = ''
3638 for k, v in sorted(style.items()):
3639 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3640 continue
3641 if k == 'color':
3642 font += ' color="%s"' % v
3643 elif k == 'fontSize':
3644 font += ' size="%s"' % v
3645 elif k == 'fontFamily':
3646 font += ' face="%s"' % v
3647 elif k == 'fontWeight' and v == 'bold':
3648 self._out += '<b>'
3649 unclosed_elements.append('b')
3650 elif k == 'fontStyle' and v == 'italic':
3651 self._out += '<i>'
3652 unclosed_elements.append('i')
3653 elif k == 'textDecoration' and v == 'underline':
3654 self._out += '<u>'
3655 unclosed_elements.append('u')
3656 if font:
3657 self._out += '<font' + font + '>'
3658 unclosed_elements.append('font')
3659 applied_style = {}
3660 if self._applied_styles:
3661 applied_style.update(self._applied_styles[-1])
3662 applied_style.update(style)
3663 self._applied_styles.append(applied_style)
3664 self._unclosed_elements.append(unclosed_elements)
3665
3666 def end(self, tag):
3667 if tag not in (_x('ttml:br'), 'br'):
3668 unclosed_elements = self._unclosed_elements.pop()
3669 for element in reversed(unclosed_elements):
3670 self._out += '</%s>' % element
3671 if unclosed_elements and self._applied_styles:
3672 self._applied_styles.pop()
3673
3674 def data(self, data):
3675 self._out += data
3676
3677 def close(self):
3678 return self._out.strip()
3679
3680 def parse_node(node):
3681 target = TTMLPElementParser()
3682 parser = xml.etree.ElementTree.XMLParser(target=target)
3683 parser.feed(xml.etree.ElementTree.tostring(node))
3684 return parser.close()
3685
3686 for k, v in LEGACY_NAMESPACES:
3687 for ns in v:
3688 dfxp_data = dfxp_data.replace(ns, k)
3689
3690 dfxp = compat_etree_fromstring(dfxp_data)
3691 out = []
3692 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3693
3694 if not paras:
3695 raise ValueError('Invalid dfxp/TTML subtitle')
3696
3697 repeat = False
3698 while True:
3699 for style in dfxp.findall(_x('.//ttml:style')):
3700 style_id = style.get('id') or style.get(_x('xml:id'))
3701 if not style_id:
3702 continue
3703 parent_style_id = style.get('style')
3704 if parent_style_id:
3705 if parent_style_id not in styles:
3706 repeat = True
3707 continue
3708 styles[style_id] = styles[parent_style_id].copy()
3709 for prop in SUPPORTED_STYLING:
3710 prop_val = style.get(_x('tts:' + prop))
3711 if prop_val:
3712 styles.setdefault(style_id, {})[prop] = prop_val
3713 if repeat:
3714 repeat = False
3715 else:
3716 break
3717
3718 for p in ('body', 'div'):
3719 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3720 if ele is None:
3721 continue
3722 style = styles.get(ele.get('style'))
3723 if not style:
3724 continue
3725 default_style.update(style)
3726
3727 for para, index in zip(paras, itertools.count(1)):
3728 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3729 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3730 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3731 if begin_time is None:
3732 continue
3733 if not end_time:
3734 if not dur:
3735 continue
3736 end_time = begin_time + dur
3737 out.append('%d\n%s --> %s\n%s\n\n' % (
3738 index,
3739 srt_subtitles_timecode(begin_time),
3740 srt_subtitles_timecode(end_time),
3741 parse_node(para)))
3742
3743 return ''.join(out)
3744
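# Minimal round-trip sketch (illustrative, assuming well-formed TTML input and
# the xpath helpers defined earlier in this file):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0" end="1">Hello</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,000\nHello\n\n'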
3745
3746 def cli_option(params, command_option, param):
3747 param = params.get(param)
3748 if param:
3749 param = compat_str(param)
3750 return [command_option, param] if param is not None else []
3751
3752
3753 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3754 param = params.get(param)
3755 if param is None:
3756 return []
3757 assert isinstance(param, bool)
3758 if separator:
3759 return [command_option + separator + (true_value if param else false_value)]
3760 return [command_option, true_value if param else false_value]
3761
3762
3763 def cli_valueless_option(params, command_option, param, expected_value=True):
3764 param = params.get(param)
3765 return [command_option] if param == expected_value else []
3766
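# Illustrative behavior of the cli_* helpers above (comment only):
#   cli_option({'proxy': 'http://x'}, '--proxy', 'proxy')
#       -> ['--proxy', 'http://x']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#       -> ['--no-check-certificate', 'true']
#   cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#       -> ['--quiet']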
3767
3768 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3769 if isinstance(argdict, (list, tuple)): # for backward compatibility
3770 if use_compat:
3771 return argdict
3772 else:
3773 argdict = None
3774 if argdict is None:
3775 return default
3776 assert isinstance(argdict, dict)
3777
3778 assert isinstance(keys, (list, tuple))
3779 for key_list in keys:
3780 arg_list = list(filter(
3781 lambda x: x is not None,
3782 [argdict.get(key.lower()) for key in variadic(key_list)]))
3783 if arg_list:
3784 return [arg for args in arg_list for arg in args]
3785 return default
3786
3787
3788 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3789 main_key, exe = main_key.lower(), exe.lower()
3790 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3791 keys = [f'{root_key}{k}' for k in (keys or [''])]
3792 if root_key in keys:
3793 if main_key != exe:
3794 keys.append((main_key, exe))
3795 keys.append('default')
3796 else:
3797 use_compat = False
3798 return cli_configuration_args(argdict, keys, default, use_compat)
3799
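# Illustrative key resolution (hypothetical caller, comment only): with
# main_key='downloader', exe='aria2c' and keys=None, the lookup order passed
# down to cli_configuration_args is ['downloader+aria2c',
# ('downloader', 'aria2c'), 'default']; the first key (or key tuple) found in
# argdict wins.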
3800
3801 class ISO639Utils(object):
3802 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3803 _lang_map = {
3804 'aa': 'aar',
3805 'ab': 'abk',
3806 'ae': 'ave',
3807 'af': 'afr',
3808 'ak': 'aka',
3809 'am': 'amh',
3810 'an': 'arg',
3811 'ar': 'ara',
3812 'as': 'asm',
3813 'av': 'ava',
3814 'ay': 'aym',
3815 'az': 'aze',
3816 'ba': 'bak',
3817 'be': 'bel',
3818 'bg': 'bul',
3819 'bh': 'bih',
3820 'bi': 'bis',
3821 'bm': 'bam',
3822 'bn': 'ben',
3823 'bo': 'bod',
3824 'br': 'bre',
3825 'bs': 'bos',
3826 'ca': 'cat',
3827 'ce': 'che',
3828 'ch': 'cha',
3829 'co': 'cos',
3830 'cr': 'cre',
3831 'cs': 'ces',
3832 'cu': 'chu',
3833 'cv': 'chv',
3834 'cy': 'cym',
3835 'da': 'dan',
3836 'de': 'deu',
3837 'dv': 'div',
3838 'dz': 'dzo',
3839 'ee': 'ewe',
3840 'el': 'ell',
3841 'en': 'eng',
3842 'eo': 'epo',
3843 'es': 'spa',
3844 'et': 'est',
3845 'eu': 'eus',
3846 'fa': 'fas',
3847 'ff': 'ful',
3848 'fi': 'fin',
3849 'fj': 'fij',
3850 'fo': 'fao',
3851 'fr': 'fra',
3852 'fy': 'fry',
3853 'ga': 'gle',
3854 'gd': 'gla',
3855 'gl': 'glg',
3856 'gn': 'grn',
3857 'gu': 'guj',
3858 'gv': 'glv',
3859 'ha': 'hau',
3860 'he': 'heb',
3861 'iw': 'heb', # Replaced by he in 1989 revision
3862 'hi': 'hin',
3863 'ho': 'hmo',
3864 'hr': 'hrv',
3865 'ht': 'hat',
3866 'hu': 'hun',
3867 'hy': 'hye',
3868 'hz': 'her',
3869 'ia': 'ina',
3870 'id': 'ind',
3871 'in': 'ind', # Replaced by id in 1989 revision
3872 'ie': 'ile',
3873 'ig': 'ibo',
3874 'ii': 'iii',
3875 'ik': 'ipk',
3876 'io': 'ido',
3877 'is': 'isl',
3878 'it': 'ita',
3879 'iu': 'iku',
3880 'ja': 'jpn',
3881 'jv': 'jav',
3882 'ka': 'kat',
3883 'kg': 'kon',
3884 'ki': 'kik',
3885 'kj': 'kua',
3886 'kk': 'kaz',
3887 'kl': 'kal',
3888 'km': 'khm',
3889 'kn': 'kan',
3890 'ko': 'kor',
3891 'kr': 'kau',
3892 'ks': 'kas',
3893 'ku': 'kur',
3894 'kv': 'kom',
3895 'kw': 'cor',
3896 'ky': 'kir',
3897 'la': 'lat',
3898 'lb': 'ltz',
3899 'lg': 'lug',
3900 'li': 'lim',
3901 'ln': 'lin',
3902 'lo': 'lao',
3903 'lt': 'lit',
3904 'lu': 'lub',
3905 'lv': 'lav',
3906 'mg': 'mlg',
3907 'mh': 'mah',
3908 'mi': 'mri',
3909 'mk': 'mkd',
3910 'ml': 'mal',
3911 'mn': 'mon',
3912 'mr': 'mar',
3913 'ms': 'msa',
3914 'mt': 'mlt',
3915 'my': 'mya',
3916 'na': 'nau',
3917 'nb': 'nob',
3918 'nd': 'nde',
3919 'ne': 'nep',
3920 'ng': 'ndo',
3921 'nl': 'nld',
3922 'nn': 'nno',
3923 'no': 'nor',
3924 'nr': 'nbl',
3925 'nv': 'nav',
3926 'ny': 'nya',
3927 'oc': 'oci',
3928 'oj': 'oji',
3929 'om': 'orm',
3930 'or': 'ori',
3931 'os': 'oss',
3932 'pa': 'pan',
3933 'pi': 'pli',
3934 'pl': 'pol',
3935 'ps': 'pus',
3936 'pt': 'por',
3937 'qu': 'que',
3938 'rm': 'roh',
3939 'rn': 'run',
3940 'ro': 'ron',
3941 'ru': 'rus',
3942 'rw': 'kin',
3943 'sa': 'san',
3944 'sc': 'srd',
3945 'sd': 'snd',
3946 'se': 'sme',
3947 'sg': 'sag',
3948 'si': 'sin',
3949 'sk': 'slk',
3950 'sl': 'slv',
3951 'sm': 'smo',
3952 'sn': 'sna',
3953 'so': 'som',
3954 'sq': 'sqi',
3955 'sr': 'srp',
3956 'ss': 'ssw',
3957 'st': 'sot',
3958 'su': 'sun',
3959 'sv': 'swe',
3960 'sw': 'swa',
3961 'ta': 'tam',
3962 'te': 'tel',
3963 'tg': 'tgk',
3964 'th': 'tha',
3965 'ti': 'tir',
3966 'tk': 'tuk',
3967 'tl': 'tgl',
3968 'tn': 'tsn',
3969 'to': 'ton',
3970 'tr': 'tur',
3971 'ts': 'tso',
3972 'tt': 'tat',
3973 'tw': 'twi',
3974 'ty': 'tah',
3975 'ug': 'uig',
3976 'uk': 'ukr',
3977 'ur': 'urd',
3978 'uz': 'uzb',
3979 've': 'ven',
3980 'vi': 'vie',
3981 'vo': 'vol',
3982 'wa': 'wln',
3983 'wo': 'wol',
3984 'xh': 'xho',
3985 'yi': 'yid',
3986 'ji': 'yid', # Replaced by yi in 1989 revision
3987 'yo': 'yor',
3988 'za': 'zha',
3989 'zh': 'zho',
3990 'zu': 'zul',
3991 }
3992
3993 @classmethod
3994 def short2long(cls, code):
3995 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3996 return cls._lang_map.get(code[:2])
3997
3998 @classmethod
3999 def long2short(cls, code):
4000 """Convert language code from ISO 639-2/T to ISO 639-1"""
4001 for short_name, long_name in cls._lang_map.items():
4002 if long_name == code:
4003 return short_name
4004
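# Example conversions (illustrative):
#   ISO639Utils.short2long('en')    -> 'eng'
#   ISO639Utils.short2long('en-US') -> 'eng' (only the first two chars are used)
#   ISO639Utils.long2short('deu')   -> 'de'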
4005
4006 class ISO3166Utils(object):
4007 # From http://data.okfn.org/data/core/country-list
4008 _country_map = {
4009 'AF': 'Afghanistan',
4010 'AX': 'Åland Islands',
4011 'AL': 'Albania',
4012 'DZ': 'Algeria',
4013 'AS': 'American Samoa',
4014 'AD': 'Andorra',
4015 'AO': 'Angola',
4016 'AI': 'Anguilla',
4017 'AQ': 'Antarctica',
4018 'AG': 'Antigua and Barbuda',
4019 'AR': 'Argentina',
4020 'AM': 'Armenia',
4021 'AW': 'Aruba',
4022 'AU': 'Australia',
4023 'AT': 'Austria',
4024 'AZ': 'Azerbaijan',
4025 'BS': 'Bahamas',
4026 'BH': 'Bahrain',
4027 'BD': 'Bangladesh',
4028 'BB': 'Barbados',
4029 'BY': 'Belarus',
4030 'BE': 'Belgium',
4031 'BZ': 'Belize',
4032 'BJ': 'Benin',
4033 'BM': 'Bermuda',
4034 'BT': 'Bhutan',
4035 'BO': 'Bolivia, Plurinational State of',
4036 'BQ': 'Bonaire, Sint Eustatius and Saba',
4037 'BA': 'Bosnia and Herzegovina',
4038 'BW': 'Botswana',
4039 'BV': 'Bouvet Island',
4040 'BR': 'Brazil',
4041 'IO': 'British Indian Ocean Territory',
4042 'BN': 'Brunei Darussalam',
4043 'BG': 'Bulgaria',
4044 'BF': 'Burkina Faso',
4045 'BI': 'Burundi',
4046 'KH': 'Cambodia',
4047 'CM': 'Cameroon',
4048 'CA': 'Canada',
4049 'CV': 'Cape Verde',
4050 'KY': 'Cayman Islands',
4051 'CF': 'Central African Republic',
4052 'TD': 'Chad',
4053 'CL': 'Chile',
4054 'CN': 'China',
4055 'CX': 'Christmas Island',
4056 'CC': 'Cocos (Keeling) Islands',
4057 'CO': 'Colombia',
4058 'KM': 'Comoros',
4059 'CG': 'Congo',
4060 'CD': 'Congo, the Democratic Republic of the',
4061 'CK': 'Cook Islands',
4062 'CR': 'Costa Rica',
4063 'CI': 'Côte d\'Ivoire',
4064 'HR': 'Croatia',
4065 'CU': 'Cuba',
4066 'CW': 'Curaçao',
4067 'CY': 'Cyprus',
4068 'CZ': 'Czech Republic',
4069 'DK': 'Denmark',
4070 'DJ': 'Djibouti',
4071 'DM': 'Dominica',
4072 'DO': 'Dominican Republic',
4073 'EC': 'Ecuador',
4074 'EG': 'Egypt',
4075 'SV': 'El Salvador',
4076 'GQ': 'Equatorial Guinea',
4077 'ER': 'Eritrea',
4078 'EE': 'Estonia',
4079 'ET': 'Ethiopia',
4080 'FK': 'Falkland Islands (Malvinas)',
4081 'FO': 'Faroe Islands',
4082 'FJ': 'Fiji',
4083 'FI': 'Finland',
4084 'FR': 'France',
4085 'GF': 'French Guiana',
4086 'PF': 'French Polynesia',
4087 'TF': 'French Southern Territories',
4088 'GA': 'Gabon',
4089 'GM': 'Gambia',
4090 'GE': 'Georgia',
4091 'DE': 'Germany',
4092 'GH': 'Ghana',
4093 'GI': 'Gibraltar',
4094 'GR': 'Greece',
4095 'GL': 'Greenland',
4096 'GD': 'Grenada',
4097 'GP': 'Guadeloupe',
4098 'GU': 'Guam',
4099 'GT': 'Guatemala',
4100 'GG': 'Guernsey',
4101 'GN': 'Guinea',
4102 'GW': 'Guinea-Bissau',
4103 'GY': 'Guyana',
4104 'HT': 'Haiti',
4105 'HM': 'Heard Island and McDonald Islands',
4106 'VA': 'Holy See (Vatican City State)',
4107 'HN': 'Honduras',
4108 'HK': 'Hong Kong',
4109 'HU': 'Hungary',
4110 'IS': 'Iceland',
4111 'IN': 'India',
4112 'ID': 'Indonesia',
4113 'IR': 'Iran, Islamic Republic of',
4114 'IQ': 'Iraq',
4115 'IE': 'Ireland',
4116 'IM': 'Isle of Man',
4117 'IL': 'Israel',
4118 'IT': 'Italy',
4119 'JM': 'Jamaica',
4120 'JP': 'Japan',
4121 'JE': 'Jersey',
4122 'JO': 'Jordan',
4123 'KZ': 'Kazakhstan',
4124 'KE': 'Kenya',
4125 'KI': 'Kiribati',
4126 'KP': 'Korea, Democratic People\'s Republic of',
4127 'KR': 'Korea, Republic of',
4128 'KW': 'Kuwait',
4129 'KG': 'Kyrgyzstan',
4130 'LA': 'Lao People\'s Democratic Republic',
4131 'LV': 'Latvia',
4132 'LB': 'Lebanon',
4133 'LS': 'Lesotho',
4134 'LR': 'Liberia',
4135 'LY': 'Libya',
4136 'LI': 'Liechtenstein',
4137 'LT': 'Lithuania',
4138 'LU': 'Luxembourg',
4139 'MO': 'Macao',
4140 'MK': 'Macedonia, the Former Yugoslav Republic of',
4141 'MG': 'Madagascar',
4142 'MW': 'Malawi',
4143 'MY': 'Malaysia',
4144 'MV': 'Maldives',
4145 'ML': 'Mali',
4146 'MT': 'Malta',
4147 'MH': 'Marshall Islands',
4148 'MQ': 'Martinique',
4149 'MR': 'Mauritania',
4150 'MU': 'Mauritius',
4151 'YT': 'Mayotte',
4152 'MX': 'Mexico',
4153 'FM': 'Micronesia, Federated States of',
4154 'MD': 'Moldova, Republic of',
4155 'MC': 'Monaco',
4156 'MN': 'Mongolia',
4157 'ME': 'Montenegro',
4158 'MS': 'Montserrat',
4159 'MA': 'Morocco',
4160 'MZ': 'Mozambique',
4161 'MM': 'Myanmar',
4162 'NA': 'Namibia',
4163 'NR': 'Nauru',
4164 'NP': 'Nepal',
4165 'NL': 'Netherlands',
4166 'NC': 'New Caledonia',
4167 'NZ': 'New Zealand',
4168 'NI': 'Nicaragua',
4169 'NE': 'Niger',
4170 'NG': 'Nigeria',
4171 'NU': 'Niue',
4172 'NF': 'Norfolk Island',
4173 'MP': 'Northern Mariana Islands',
4174 'NO': 'Norway',
4175 'OM': 'Oman',
4176 'PK': 'Pakistan',
4177 'PW': 'Palau',
4178 'PS': 'Palestine, State of',
4179 'PA': 'Panama',
4180 'PG': 'Papua New Guinea',
4181 'PY': 'Paraguay',
4182 'PE': 'Peru',
4183 'PH': 'Philippines',
4184 'PN': 'Pitcairn',
4185 'PL': 'Poland',
4186 'PT': 'Portugal',
4187 'PR': 'Puerto Rico',
4188 'QA': 'Qatar',
4189 'RE': 'Réunion',
4190 'RO': 'Romania',
4191 'RU': 'Russian Federation',
4192 'RW': 'Rwanda',
4193 'BL': 'Saint Barthélemy',
4194 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4195 'KN': 'Saint Kitts and Nevis',
4196 'LC': 'Saint Lucia',
4197 'MF': 'Saint Martin (French part)',
4198 'PM': 'Saint Pierre and Miquelon',
4199 'VC': 'Saint Vincent and the Grenadines',
4200 'WS': 'Samoa',
4201 'SM': 'San Marino',
4202 'ST': 'Sao Tome and Principe',
4203 'SA': 'Saudi Arabia',
4204 'SN': 'Senegal',
4205 'RS': 'Serbia',
4206 'SC': 'Seychelles',
4207 'SL': 'Sierra Leone',
4208 'SG': 'Singapore',
4209 'SX': 'Sint Maarten (Dutch part)',
4210 'SK': 'Slovakia',
4211 'SI': 'Slovenia',
4212 'SB': 'Solomon Islands',
4213 'SO': 'Somalia',
4214 'ZA': 'South Africa',
4215 'GS': 'South Georgia and the South Sandwich Islands',
4216 'SS': 'South Sudan',
4217 'ES': 'Spain',
4218 'LK': 'Sri Lanka',
4219 'SD': 'Sudan',
4220 'SR': 'Suriname',
4221 'SJ': 'Svalbard and Jan Mayen',
4222 'SZ': 'Swaziland',
4223 'SE': 'Sweden',
4224 'CH': 'Switzerland',
4225 'SY': 'Syrian Arab Republic',
4226 'TW': 'Taiwan, Province of China',
4227 'TJ': 'Tajikistan',
4228 'TZ': 'Tanzania, United Republic of',
4229 'TH': 'Thailand',
4230 'TL': 'Timor-Leste',
4231 'TG': 'Togo',
4232 'TK': 'Tokelau',
4233 'TO': 'Tonga',
4234 'TT': 'Trinidad and Tobago',
4235 'TN': 'Tunisia',
4236 'TR': 'Turkey',
4237 'TM': 'Turkmenistan',
4238 'TC': 'Turks and Caicos Islands',
4239 'TV': 'Tuvalu',
4240 'UG': 'Uganda',
4241 'UA': 'Ukraine',
4242 'AE': 'United Arab Emirates',
4243 'GB': 'United Kingdom',
4244 'US': 'United States',
4245 'UM': 'United States Minor Outlying Islands',
4246 'UY': 'Uruguay',
4247 'UZ': 'Uzbekistan',
4248 'VU': 'Vanuatu',
4249 'VE': 'Venezuela, Bolivarian Republic of',
4250 'VN': 'Viet Nam',
4251 'VG': 'Virgin Islands, British',
4252 'VI': 'Virgin Islands, U.S.',
4253 'WF': 'Wallis and Futuna',
4254 'EH': 'Western Sahara',
4255 'YE': 'Yemen',
4256 'ZM': 'Zambia',
4257 'ZW': 'Zimbabwe',
4258 }
4259
4260 @classmethod
4261 def short2full(cls, code):
4262 """Convert an ISO 3166-2 country code to the corresponding full name"""
4263 return cls._country_map.get(code.upper())
4264
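# Example (illustrative): ISO3166Utils.short2full('de') -> 'Germany';
# the lookup is case-insensitive since the code is upper-cased first.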
4265
4266 class GeoUtils(object):
4267 # Major IPv4 address blocks per country
4268 _country_ip_map = {
4269 'AD': '46.172.224.0/19',
4270 'AE': '94.200.0.0/13',
4271 'AF': '149.54.0.0/17',
4272 'AG': '209.59.64.0/18',
4273 'AI': '204.14.248.0/21',
4274 'AL': '46.99.0.0/16',
4275 'AM': '46.70.0.0/15',
4276 'AO': '105.168.0.0/13',
4277 'AP': '182.50.184.0/21',
4278 'AQ': '23.154.160.0/24',
4279 'AR': '181.0.0.0/12',
4280 'AS': '202.70.112.0/20',
4281 'AT': '77.116.0.0/14',
4282 'AU': '1.128.0.0/11',
4283 'AW': '181.41.0.0/18',
4284 'AX': '185.217.4.0/22',
4285 'AZ': '5.197.0.0/16',
4286 'BA': '31.176.128.0/17',
4287 'BB': '65.48.128.0/17',
4288 'BD': '114.130.0.0/16',
4289 'BE': '57.0.0.0/8',
4290 'BF': '102.178.0.0/15',
4291 'BG': '95.42.0.0/15',
4292 'BH': '37.131.0.0/17',
4293 'BI': '154.117.192.0/18',
4294 'BJ': '137.255.0.0/16',
4295 'BL': '185.212.72.0/23',
4296 'BM': '196.12.64.0/18',
4297 'BN': '156.31.0.0/16',
4298 'BO': '161.56.0.0/16',
4299 'BQ': '161.0.80.0/20',
4300 'BR': '191.128.0.0/12',
4301 'BS': '24.51.64.0/18',
4302 'BT': '119.2.96.0/19',
4303 'BW': '168.167.0.0/16',
4304 'BY': '178.120.0.0/13',
4305 'BZ': '179.42.192.0/18',
4306 'CA': '99.224.0.0/11',
4307 'CD': '41.243.0.0/16',
4308 'CF': '197.242.176.0/21',
4309 'CG': '160.113.0.0/16',
4310 'CH': '85.0.0.0/13',
4311 'CI': '102.136.0.0/14',
4312 'CK': '202.65.32.0/19',
4313 'CL': '152.172.0.0/14',
4314 'CM': '102.244.0.0/14',
4315 'CN': '36.128.0.0/10',
4316 'CO': '181.240.0.0/12',
4317 'CR': '201.192.0.0/12',
4318 'CU': '152.206.0.0/15',
4319 'CV': '165.90.96.0/19',
4320 'CW': '190.88.128.0/17',
4321 'CY': '31.153.0.0/16',
4322 'CZ': '88.100.0.0/14',
4323 'DE': '53.0.0.0/8',
4324 'DJ': '197.241.0.0/17',
4325 'DK': '87.48.0.0/12',
4326 'DM': '192.243.48.0/20',
4327 'DO': '152.166.0.0/15',
4328 'DZ': '41.96.0.0/12',
4329 'EC': '186.68.0.0/15',
4330 'EE': '90.190.0.0/15',
4331 'EG': '156.160.0.0/11',
4332 'ER': '196.200.96.0/20',
4333 'ES': '88.0.0.0/11',
4334 'ET': '196.188.0.0/14',
4335 'EU': '2.16.0.0/13',
4336 'FI': '91.152.0.0/13',
4337 'FJ': '144.120.0.0/16',
4338 'FK': '80.73.208.0/21',
4339 'FM': '119.252.112.0/20',
4340 'FO': '88.85.32.0/19',
4341 'FR': '90.0.0.0/9',
4342 'GA': '41.158.0.0/15',
4343 'GB': '25.0.0.0/8',
4344 'GD': '74.122.88.0/21',
4345 'GE': '31.146.0.0/16',
4346 'GF': '161.22.64.0/18',
4347 'GG': '62.68.160.0/19',
4348 'GH': '154.160.0.0/12',
4349 'GI': '95.164.0.0/16',
4350 'GL': '88.83.0.0/19',
4351 'GM': '160.182.0.0/15',
4352 'GN': '197.149.192.0/18',
4353 'GP': '104.250.0.0/19',
4354 'GQ': '105.235.224.0/20',
4355 'GR': '94.64.0.0/13',
4356 'GT': '168.234.0.0/16',
4357 'GU': '168.123.0.0/16',
4358 'GW': '197.214.80.0/20',
4359 'GY': '181.41.64.0/18',
4360 'HK': '113.252.0.0/14',
4361 'HN': '181.210.0.0/16',
4362 'HR': '93.136.0.0/13',
4363 'HT': '148.102.128.0/17',
4364 'HU': '84.0.0.0/14',
4365 'ID': '39.192.0.0/10',
4366 'IE': '87.32.0.0/12',
4367 'IL': '79.176.0.0/13',
4368 'IM': '5.62.80.0/20',
4369 'IN': '117.192.0.0/10',
4370 'IO': '203.83.48.0/21',
4371 'IQ': '37.236.0.0/14',
4372 'IR': '2.176.0.0/12',
4373 'IS': '82.221.0.0/16',
4374 'IT': '79.0.0.0/10',
4375 'JE': '87.244.64.0/18',
4376 'JM': '72.27.0.0/17',
4377 'JO': '176.29.0.0/16',
4378 'JP': '133.0.0.0/8',
4379 'KE': '105.48.0.0/12',
4380 'KG': '158.181.128.0/17',
4381 'KH': '36.37.128.0/17',
4382 'KI': '103.25.140.0/22',
4383 'KM': '197.255.224.0/20',
4384 'KN': '198.167.192.0/19',
4385 'KP': '175.45.176.0/22',
4386 'KR': '175.192.0.0/10',
4387 'KW': '37.36.0.0/14',
4388 'KY': '64.96.0.0/15',
4389 'KZ': '2.72.0.0/13',
4390 'LA': '115.84.64.0/18',
4391 'LB': '178.135.0.0/16',
4392 'LC': '24.92.144.0/20',
4393 'LI': '82.117.0.0/19',
4394 'LK': '112.134.0.0/15',
4395 'LR': '102.183.0.0/16',
4396 'LS': '129.232.0.0/17',
4397 'LT': '78.56.0.0/13',
4398 'LU': '188.42.0.0/16',
4399 'LV': '46.109.0.0/16',
4400 'LY': '41.252.0.0/14',
4401 'MA': '105.128.0.0/11',
4402 'MC': '88.209.64.0/18',
4403 'MD': '37.246.0.0/16',
4404 'ME': '178.175.0.0/17',
4405 'MF': '74.112.232.0/21',
4406 'MG': '154.126.0.0/17',
4407 'MH': '117.103.88.0/21',
4408 'MK': '77.28.0.0/15',
4409 'ML': '154.118.128.0/18',
4410 'MM': '37.111.0.0/17',
4411 'MN': '49.0.128.0/17',
4412 'MO': '60.246.0.0/16',
4413 'MP': '202.88.64.0/20',
4414 'MQ': '109.203.224.0/19',
4415 'MR': '41.188.64.0/18',
4416 'MS': '208.90.112.0/22',
4417 'MT': '46.11.0.0/16',
4418 'MU': '105.16.0.0/12',
4419 'MV': '27.114.128.0/18',
4420 'MW': '102.70.0.0/15',
4421 'MX': '187.192.0.0/11',
4422 'MY': '175.136.0.0/13',
4423 'MZ': '197.218.0.0/15',
4424 'NA': '41.182.0.0/16',
4425 'NC': '101.101.0.0/18',
4426 'NE': '197.214.0.0/18',
4427 'NF': '203.17.240.0/22',
4428 'NG': '105.112.0.0/12',
4429 'NI': '186.76.0.0/15',
4430 'NL': '145.96.0.0/11',
4431 'NO': '84.208.0.0/13',
4432 'NP': '36.252.0.0/15',
4433 'NR': '203.98.224.0/19',
4434 'NU': '49.156.48.0/22',
4435 'NZ': '49.224.0.0/14',
4436 'OM': '5.36.0.0/15',
4437 'PA': '186.72.0.0/15',
4438 'PE': '186.160.0.0/14',
4439 'PF': '123.50.64.0/18',
4440 'PG': '124.240.192.0/19',
4441 'PH': '49.144.0.0/13',
4442 'PK': '39.32.0.0/11',
4443 'PL': '83.0.0.0/11',
4444 'PM': '70.36.0.0/20',
4445 'PR': '66.50.0.0/16',
4446 'PS': '188.161.0.0/16',
4447 'PT': '85.240.0.0/13',
4448 'PW': '202.124.224.0/20',
4449 'PY': '181.120.0.0/14',
4450 'QA': '37.210.0.0/15',
4451 'RE': '102.35.0.0/16',
4452 'RO': '79.112.0.0/13',
4453 'RS': '93.86.0.0/15',
4454 'RU': '5.136.0.0/13',
4455 'RW': '41.186.0.0/16',
4456 'SA': '188.48.0.0/13',
4457 'SB': '202.1.160.0/19',
4458 'SC': '154.192.0.0/11',
4459 'SD': '102.120.0.0/13',
4460 'SE': '78.64.0.0/12',
4461 'SG': '8.128.0.0/10',
4462 'SI': '188.196.0.0/14',
4463 'SK': '78.98.0.0/15',
4464 'SL': '102.143.0.0/17',
4465 'SM': '89.186.32.0/19',
4466 'SN': '41.82.0.0/15',
4467 'SO': '154.115.192.0/18',
4468 'SR': '186.179.128.0/17',
4469 'SS': '105.235.208.0/21',
4470 'ST': '197.159.160.0/19',
4471 'SV': '168.243.0.0/16',
4472 'SX': '190.102.0.0/20',
4473 'SY': '5.0.0.0/16',
4474 'SZ': '41.84.224.0/19',
4475 'TC': '65.255.48.0/20',
4476 'TD': '154.68.128.0/19',
4477 'TG': '196.168.0.0/14',
4478 'TH': '171.96.0.0/13',
4479 'TJ': '85.9.128.0/18',
4480 'TK': '27.96.24.0/21',
4481 'TL': '180.189.160.0/20',
4482 'TM': '95.85.96.0/19',
4483 'TN': '197.0.0.0/11',
4484 'TO': '175.176.144.0/21',
4485 'TR': '78.160.0.0/11',
4486 'TT': '186.44.0.0/15',
4487 'TV': '202.2.96.0/19',
4488 'TW': '120.96.0.0/11',
4489 'TZ': '156.156.0.0/14',
4490 'UA': '37.52.0.0/14',
4491 'UG': '102.80.0.0/13',
4492 'US': '6.0.0.0/8',
4493 'UY': '167.56.0.0/13',
4494 'UZ': '84.54.64.0/18',
4495 'VA': '212.77.0.0/19',
4496 'VC': '207.191.240.0/21',
4497 'VE': '186.88.0.0/13',
4498 'VG': '66.81.192.0/20',
4499 'VI': '146.226.0.0/16',
4500 'VN': '14.160.0.0/11',
4501 'VU': '202.80.32.0/20',
4502 'WF': '117.20.32.0/21',
4503 'WS': '202.4.32.0/19',
4504 'YE': '134.35.0.0/16',
4505 'YT': '41.242.116.0/22',
4506 'ZA': '41.0.0.0/11',
4507 'ZM': '102.144.0.0/13',
4508 'ZW': '102.177.192.0/18',
4509 }
4510
4511 @classmethod
4512 def random_ipv4(cls, code_or_block):
4513 if len(code_or_block) == 2:
4514 block = cls._country_ip_map.get(code_or_block.upper())
4515 if not block:
4516 return None
4517 else:
4518 block = code_or_block
4519 addr, preflen = block.split('/')
4520 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4521 addr_max = addr_min | (0xffffffff >> int(preflen))
4522 return compat_str(socket.inet_ntoa(
4523 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4524
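# Sketch of the address math (illustrative): for 'US' the block '6.0.0.0/8'
# gives addr_min = 0x06000000 and addr_max = addr_min | (0xffffffff >> 8),
# i.e. 6.255.255.255, so random_ipv4('US') returns an address in
# 6.0.0.0-6.255.255.255. A CIDR block can also be passed directly, e.g.
# GeoUtils.random_ipv4('192.168.0.0/16').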
4525
4526 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4527 def __init__(self, proxies=None):
4528 # Set default handlers
4529 for type in ('http', 'https'):
4530 setattr(self, '%s_open' % type,
4531 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4532 meth(r, proxy, type))
4533 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4534
4535 def proxy_open(self, req, proxy, type):
4536 req_proxy = req.headers.get('Ytdl-request-proxy')
4537 if req_proxy is not None:
4538 proxy = req_proxy
4539 del req.headers['Ytdl-request-proxy']
4540
4541 if proxy == '__noproxy__':
4542 return None # No Proxy
4543 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4544 req.add_header('Ytdl-socks-proxy', proxy)
4545 # yt-dlp's http/https handlers wrap the socket with socks
4546 return None
4547 return compat_urllib_request.ProxyHandler.proxy_open(
4548 self, req, proxy, type)
4549
4550
4551 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4552 # released into Public Domain
4553 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4554
4555 def long_to_bytes(n, blocksize=0):
4556 """long_to_bytes(n:long, blocksize:int) : string
4557 Convert a long integer to a byte string.
4558
4559 If optional blocksize is given and greater than zero, pad the front of the
4560 byte string with binary zeros so that the length is a multiple of
4561 blocksize.
4562 """
4563 # after much testing, this algorithm was deemed to be the fastest
4564 s = b''
4565 n = int(n)
4566 while n > 0:
4567 s = compat_struct_pack('>I', n & 0xffffffff) + s
4568 n = n >> 32
4569 # strip off leading zeros
4570 for i in range(len(s)):
4571 if s[i] != b'\000'[0]:
4572 break
4573 else:
4574 # only happens when n == 0
4575 s = b'\000'
4576 i = 0
4577 s = s[i:]
4578 # add back some pad bytes. this could be done more efficiently w.r.t. the
4579 # de-padding being done above, but sigh...
4580 if blocksize > 0 and len(s) % blocksize:
4581 s = (blocksize - len(s) % blocksize) * b'\000' + s
4582 return s
4583
4584
4585 def bytes_to_long(s):
4586 """bytes_to_long(string) : long
4587 Convert a byte string to a long integer.
4588
4589 This is (essentially) the inverse of long_to_bytes().
4590 """
4591 acc = 0
4592 length = len(s)
4593 if length % 4:
4594 extra = (4 - length % 4)
4595 s = b'\000' * extra + s
4596 length = length + extra
4597 for i in range(0, length, 4):
4598 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4599 return acc
4600
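# Round-trip example (illustrative):
#   >>> long_to_bytes(bytes_to_long(b'\x00\x01\x02'), blocksize=3)
#   b'\x00\x01\x02'
# bytes_to_long(b'\x00\x01\x02') == 258; without the blocksize padding,
# long_to_bytes would strip the leading zero byte.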
4601
4602 def ohdave_rsa_encrypt(data, exponent, modulus):
4603 '''
4604 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4605
4606 Input:
4607 data: data to encrypt, bytes-like object
4608 exponent, modulus: parameter e and N of RSA algorithm, both integer
4609 Output: hex string of encrypted data
4610
4611 Limitation: supports one block encryption only
4612 '''
4613
4614 payload = int(binascii.hexlify(data[::-1]), 16)
4615 encrypted = pow(payload, exponent, modulus)
4616 return '%x' % encrypted
4617
4618
4619 def pkcs1pad(data, length):
4620 """
4621 Padding input data with PKCS#1 scheme
4622
4623 @param {int[]} data input data
4624 @param {int} length target length
4625 @returns {int[]} padded data
4626 """
4627 if len(data) > length - 11:
4628 raise ValueError('Input data too long for PKCS#1 padding')
4629
4630 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4631 return [0, 2] + pseudo_random + [0] + data
4632
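# Example shape (illustrative): pkcs1pad([1, 2, 3], 16) returns a 16-element
# list [0, 2, r1, ..., r10, 0, 1, 2, 3], where the length - len(data) - 3
# padding values r are random integers in [0, 254].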
4633
4634 def encode_base_n(num, n, table=None):
4635 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4636 if not table:
4637 table = FULL_TABLE[:n]
4638
4639 if n > len(table):
4640 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4641
4642 if num == 0:
4643 return table[0]
4644
4645 ret = ''
4646 while num:
4647 ret = table[num % n] + ret
4648 num = num // n
4649 return ret
4650
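# Examples (illustrative): encode_base_n(255, 16) == 'ff' and
# encode_base_n(35, 36) == 'z'. decode_packed_codes below relies on this to
# map sequential counts back onto the identifiers of P.A.C.K.E.R.-style
# obfuscated scripts.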
4651
4652 def decode_packed_codes(code):
4653 mobj = re.search(PACKED_CODES_RE, code)
4654 obfuscated_code, base, count, symbols = mobj.groups()
4655 base = int(base)
4656 count = int(count)
4657 symbols = symbols.split('|')
4658 symbol_table = {}
4659
4660 while count:
4661 count -= 1
4662 base_n_count = encode_base_n(count, base)
4663 symbol_table[base_n_count] = symbols[count] or base_n_count
4664
4665 return re.sub(
4666 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4667 obfuscated_code)
4668
4669
4670 def caesar(s, alphabet, shift):
4671 if shift == 0:
4672 return s
4673 l = len(alphabet)
4674 return ''.join(
4675 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4676 for c in s)
4677
4678
4679 def rot47(s):
4680 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4681
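# Note (illustrative): rot47 shifts within the 94 printable ASCII characters
# '!'..'~', so it is its own inverse: rot47(rot47(s)) == s for any string
# (characters outside the alphabet pass through caesar unchanged).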
4682
4683 def parse_m3u8_attributes(attrib):
4684 info = {}
4685 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4686 if val.startswith('"'):
4687 val = val[1:-1]
4688 info[key] = val
4689 return info
4690
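# Example (illustrative) - quoted values may contain commas:
#   >>> parse_m3u8_attributes('BANDWIDTH=800000,CODECS="mp4a.40.2,avc1.4d401e"')
#   {'BANDWIDTH': '800000', 'CODECS': 'mp4a.40.2,avc1.4d401e'}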
4691
4692 def urshift(val, n):
4693 return val >> n if val >= 0 else (val + 0x100000000) >> n
4694
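# Example (illustrative): urshift emulates JavaScript's unsigned '>>>' for
# 32-bit values, e.g. urshift(-1, 28) == 15 whereas -1 >> 28 == -1 in Python.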
4695
4696 # Based on png2str() written by @gdkchan and improved by @yokrysty
4697 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4698 def decode_png(png_data):
4699 # Reference: https://www.w3.org/TR/PNG/
4700 header = png_data[8:]
4701
4702 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4703 raise IOError('Not a valid PNG file.')
4704
4705 int_map = {1: '>B', 2: '>H', 4: '>I'}
4706 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4707
4708 chunks = []
4709
4710 while header:
4711 length = unpack_integer(header[:4])
4712 header = header[4:]
4713
4714 chunk_type = header[:4]
4715 header = header[4:]
4716
4717 chunk_data = header[:length]
4718 header = header[length:]
4719
4720 header = header[4:] # Skip CRC
4721
4722 chunks.append({
4723 'type': chunk_type,
4724 'length': length,
4725 'data': chunk_data
4726 })
4727
4728 ihdr = chunks[0]['data']
4729
4730 width = unpack_integer(ihdr[:4])
4731 height = unpack_integer(ihdr[4:8])
4732
4733 idat = b''
4734
4735 for chunk in chunks:
4736 if chunk['type'] == b'IDAT':
4737 idat += chunk['data']
4738
4739 if not idat:
4740 raise IOError('Unable to read PNG data.')
4741
4742 decompressed_data = bytearray(zlib.decompress(idat))
4743
4744 stride = width * 3
4745 pixels = []
4746
4747 def _get_pixel(idx):
4748 x = idx % stride
4749 y = idx // stride
4750 return pixels[y][x]
4751
4752 for y in range(height):
4753 basePos = y * (1 + stride)
4754 filter_type = decompressed_data[basePos]
4755
4756 current_row = []
4757
4758 pixels.append(current_row)
4759
4760 for x in range(stride):
4761 color = decompressed_data[1 + basePos + x]
4762 basex = y * stride + x
4763 left = 0
4764 up = 0
4765
4766 if x > 2:
4767 left = _get_pixel(basex - 3)
4768 if y > 0:
4769 up = _get_pixel(basex - stride)
4770
4771 if filter_type == 1: # Sub
4772 color = (color + left) & 0xff
4773 elif filter_type == 2: # Up
4774 color = (color + up) & 0xff
4775 elif filter_type == 3: # Average
4776 color = (color + ((left + up) >> 1)) & 0xff
4777 elif filter_type == 4: # Paeth
4778 a = left
4779 b = up
4780 c = 0
4781
4782 if x > 2 and y > 0:
4783 c = _get_pixel(basex - stride - 3)
4784
4785 p = a + b - c
4786
4787 pa = abs(p - a)
4788 pb = abs(p - b)
4789 pc = abs(p - c)
4790
4791 if pa <= pb and pa <= pc:
4792 color = (color + a) & 0xff
4793 elif pb <= pc:
4794 color = (color + b) & 0xff
4795 else:
4796 color = (color + c) & 0xff
4797
4798 current_row.append(color)
4799
4800 return width, height, pixels
4801
4802
4803 def write_xattr(path, key, value):
4804 # This mess below finds the best xattr tool for the job
4805 try:
4806 # try the pyxattr module...
4807 import xattr
4808
4809 if hasattr(xattr, 'set'): # pyxattr
4810 # Unicode arguments are not supported in python-pyxattr until
4811 # version 0.5.0
4812 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4813 pyxattr_required_version = '0.5.0'
4814 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4815 # TODO: fallback to CLI tools
4816 raise XAttrUnavailableError(
4817 'python-pyxattr is detected but is too old. '
4818 'yt-dlp requires %s or above while your version is %s. '
4819 'Falling back to other xattr implementations' % (
4820 pyxattr_required_version, xattr.__version__))
4821
4822 setxattr = xattr.set
4823 else: # xattr
4824 setxattr = xattr.setxattr
4825
4826 try:
4827 setxattr(path, key, value)
4828 except EnvironmentError as e:
4829 raise XAttrMetadataError(e.errno, e.strerror)
4830
4831 except ImportError:
4832 if compat_os_name == 'nt':
4833 # Write xattrs to NTFS Alternate Data Streams:
4834 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4835 assert ':' not in key
4836 assert os.path.exists(path)
4837
4838 ads_fn = path + ':' + key
4839 try:
4840 with open(ads_fn, 'wb') as f:
4841 f.write(value)
4842 except EnvironmentError as e:
4843 raise XAttrMetadataError(e.errno, e.strerror)
4844 else:
4845 user_has_setfattr = check_executable('setfattr', ['--version'])
4846 user_has_xattr = check_executable('xattr', ['-h'])
4847
4848 if user_has_setfattr or user_has_xattr:
4849
4850 value = value.decode('utf-8')
4851 if user_has_setfattr:
4852 executable = 'setfattr'
4853 opts = ['-n', key, '-v', value]
4854 elif user_has_xattr:
4855 executable = 'xattr'
4856 opts = ['-w', key, value]
4857
4858 cmd = ([encodeFilename(executable, True)]
4859 + [encodeArgument(o) for o in opts]
4860 + [encodeFilename(path, True)])
4861
4862 try:
4863 p = Popen(
4864 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4865 except EnvironmentError as e:
4866 raise XAttrMetadataError(e.errno, e.strerror)
4867 stdout, stderr = p.communicate_or_kill()
4868 stderr = stderr.decode('utf-8', 'replace')
4869 if p.returncode != 0:
4870 raise XAttrMetadataError(p.returncode, stderr)
4871
4872 else:
4873 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
4874 if sys.platform.startswith('linux'):
4875 raise XAttrUnavailableError(
4876 "Couldn't find a tool to set the xattrs. "
4877 "Install either the python 'pyxattr' or 'xattr' "
4878 "modules, or the GNU 'attr' package "
4879 "(which contains the 'setfattr' tool).")
4880 else:
4881 raise XAttrUnavailableError(
4882 "Couldn't find a tool to set the xattrs. "
4883 "Install either the python 'xattr' module, "
4884 "or the 'xattr' binary.")
4885
4886
4887 def random_birthday(year_field, month_field, day_field):
4888 start_date = datetime.date(1950, 1, 1)
4889 end_date = datetime.date(1995, 12, 31)
4890 offset = random.randint(0, (end_date - start_date).days)
4891 random_date = start_date + datetime.timedelta(offset)
4892 return {
4893 year_field: str(random_date.year),
4894 month_field: str(random_date.month),
4895 day_field: str(random_date.day),
4896 }
4897
4898
4899 # Templates for internet shortcut files, which are plain text files.
4900 DOT_URL_LINK_TEMPLATE = '''
4901 [InternetShortcut]
4902 URL=%(url)s
4903 '''.lstrip()
4904
4905 DOT_WEBLOC_LINK_TEMPLATE = '''
4906 <?xml version="1.0" encoding="UTF-8"?>
4907 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4908 <plist version="1.0">
4909 <dict>
4910 \t<key>URL</key>
4911 \t<string>%(url)s</string>
4912 </dict>
4913 </plist>
4914 '''.lstrip()
4915
4916 DOT_DESKTOP_LINK_TEMPLATE = '''
4917 [Desktop Entry]
4918 Encoding=UTF-8
4919 Name=%(filename)s
4920 Type=Link
4921 URL=%(url)s
4922 Icon=text-html
4923 '''.lstrip()
4924
4925 LINK_TEMPLATES = {
4926 'url': DOT_URL_LINK_TEMPLATE,
4927 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4928 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4929 }
4930
4931
4932 def iri_to_uri(iri):
4933 """
4934 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4935
4936 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4937 """
4938
4939 iri_parts = compat_urllib_parse_urlparse(iri)
4940
4941 if '[' in iri_parts.netloc:
4942 raise ValueError('IPv6 URIs are not yet supported.')
4943 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4944
4945 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4946
4947 net_location = ''
4948 if iri_parts.username:
4949 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4950 if iri_parts.password is not None:
4951 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4952 net_location += '@'
4953
4954 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4955 # The 'idna' encoding produces ASCII text.
4956 if iri_parts.port is not None and iri_parts.port != 80:
4957 net_location += ':' + str(iri_parts.port)
4958
4959 return compat_urllib_parse_urlunparse(
4960 (iri_parts.scheme,
4961 net_location,
4962
4963 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4964
4965 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4966 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4967
4968 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4969 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4970
4971 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4972
4973 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4974
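# Illustrative effect (comment only):
#   iri_to_uri('http://example.com/über?q=ß')
#   -> 'http://example.com/%C3%BCber?q=%C3%9F'
# Already percent-escaped sequences such as '%3C' are left untouched.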
4975
4976 def to_high_limit_path(path):
4977 if sys.platform in ['win32', 'cygwin']:
4978 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4979 return r'\\?\ '.rstrip() + os.path.abspath(path)
4980
4981 return path
4982
4983
4984 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4985 val = traverse_obj(obj, *variadic(field))
4986 if val in ignore:
4987 return default
4988 return template % (func(val) if func else val)
4989
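# Examples (illustrative):
#   format_field({'width': 1280}, 'width', '%dpx')        -> '1280px'
#   format_field({}, 'width', '%dpx', default='unknown')  -> 'unknown'
#   format_field({'n': 4}, 'n', func=lambda x: x * 2)     -> '8'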
4990
4991 def clean_podcast_url(url):
4992 return re.sub(r'''(?x)
4993 (?:
4994 (?:
4995 chtbl\.com/track|
4996 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4997 play\.podtrac\.com
4998 )/[^/]+|
4999 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5000 flex\.acast\.com|
5001 pd(?:
5002 cn\.co| # https://podcorn.com/analytics-prefix/
5003 st\.fm # https://podsights.com/docs/
5004 )/e
5005 )/''', '', url)
5006
5007
5008 _HEX_TABLE = '0123456789abcdef'
5009
5010
5011 def random_uuidv4():
5012 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5013
5014
5015 def make_dir(path, to_screen=None):
5016 try:
5017 dn = os.path.dirname(path)
5018 if dn and not os.path.exists(dn):
5019 os.makedirs(dn)
5020 return True
5021 except (OSError, IOError) as err:
5022 if callable(to_screen):
5023 to_screen('unable to create directory ' + error_to_compat_str(err))
5024 return False
5025
5026
5027 def get_executable_path():
5028 from zipimport import zipimporter
5029 if hasattr(sys, 'frozen'): # Running from PyInstaller
5030 path = os.path.dirname(sys.executable)
5031 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5032 path = os.path.join(os.path.dirname(__file__), '../..')
5033 else:
5034 path = os.path.join(os.path.dirname(__file__), '..')
5035 return os.path.abspath(path)
5036
5037
5038 def load_plugins(name, suffix, namespace):
5039 classes = {}
5040 try:
5041 plugins_spec = importlib.util.spec_from_file_location(
5042 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5043 plugins = importlib.util.module_from_spec(plugins_spec)
5044 sys.modules[plugins_spec.name] = plugins
5045 plugins_spec.loader.exec_module(plugins)
5046 for name in dir(plugins):
5047 if name in namespace:
5048 continue
5049 if not name.endswith(suffix):
5050 continue
5051 klass = getattr(plugins, name)
5052 classes[name] = namespace[name] = klass
5053 except FileNotFoundError:
5054 pass
5055 return classes
5056
5057
5058 def traverse_obj(
5059 obj, *path_list, default=None, expected_type=None, get_all=True,
5060 casesense=True, is_user_input=False, traverse_string=False):
5061 ''' Traverse nested list/dict/tuple
5062 @param path_list A list of paths which are checked one by one.
5063 Each path is a list of keys where each key is a string,
5064 a function, a tuple of strings/None or "...".
5065 When a function is given, it takes the key as argument and
5066 returns whether the key matches or not. When a tuple is given,
5067 all the keys given in the tuple are traversed.
5068 "..." traverses all the keys in the object, and
5069 "None" returns the object without traversal.
5070 @param default Default value to return
5071 @param expected_type Only accept final value of this type (Can also be any callable)
5072 @param get_all Return all the values obtained from a path or only the first one
5073 @param casesense Whether to consider dictionary keys as case sensitive
5074 @param is_user_input Whether the keys are generated from user input. If True,
5075 strings are converted to int/slice if necessary
5076 @param traverse_string Whether to traverse inside strings. If True, any
5077 non-compatible object will also be converted into a string
5078 # TODO: Write tests
5079 '''
5080 if not casesense:
5081 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5082 path_list = (map(_lower, variadic(path)) for path in path_list)
5083
5084 def _traverse_obj(obj, path, _current_depth=0):
5085 nonlocal depth
5086 path = tuple(variadic(path))
5087 for i, key in enumerate(path):
5088 if None in (key, obj):
5089 return obj
5090 if isinstance(key, (list, tuple)):
5091 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5092 key = ...
5093 if key is ...:
5094 obj = (obj.values() if isinstance(obj, dict)
5095 else obj if isinstance(obj, (list, tuple, LazyList))
5096 else str(obj) if traverse_string else [])
5097 _current_depth += 1
5098 depth = max(depth, _current_depth)
5099 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5100 elif callable(key):
5101 if isinstance(obj, (list, tuple, LazyList)):
5102 obj = enumerate(obj)
5103 elif isinstance(obj, dict):
5104 obj = obj.items()
5105 else:
5106 if not traverse_string:
5107 return None
5108 obj = str(obj)
5109 _current_depth += 1
5110 depth = max(depth, _current_depth)
5111 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5112 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5113 obj = (obj.get(key) if casesense or (key in obj)
5114 else next((v for k, v in obj.items() if _lower(k) == key), None))
5115 else:
5116 if is_user_input:
5117 key = (int_or_none(key) if ':' not in key
5118 else slice(*map(int_or_none, key.split(':'))))
5119 if key == slice(None):
5120 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5121 if not isinstance(key, (int, slice)):
5122 return None
5123 if not isinstance(obj, (list, tuple, LazyList)):
5124 if not traverse_string:
5125 return None
5126 obj = str(obj)
5127 try:
5128 obj = obj[key]
5129 except IndexError:
5130 return None
5131 return obj
5132
5133 if isinstance(expected_type, type):
5134 type_test = lambda val: val if isinstance(val, expected_type) else None
5135 elif expected_type is not None:
5136 type_test = expected_type
5137 else:
5138 type_test = lambda val: val
5139
5140 for path in path_list:
5141 depth = 0
5142 val = _traverse_obj(obj, path)
5143 if val is not None:
5144 if depth:
5145 for _ in range(depth - 1):
5146 val = itertools.chain.from_iterable(v for v in val if v is not None)
5147 val = [v for v in map(type_test, val) if v is not None]
5148 if val:
5149 return val if get_all else val[0]
5150 else:
5151 val = type_test(val)
5152 if val is not None:
5153 return val
5154 return default
5155
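# Illustrative traversals (comment only):
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', 0, 'b'))   -> 1
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b')) -> [1, 2]
#   traverse_obj({'a': {}}, ('a', 'b'), ('a',))                -> {}
# (paths are tried in order; the first one yielding a non-None value wins)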
5156
5157 def traverse_dict(dictn, keys, casesense=True):
5158 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5159 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5160 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5161
5162
5163 def variadic(x, allowed_types=(str, bytes, dict)):
5164 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5165
5166
5167 # Creates a JSON Web Signature (JWS) with the HS256 algorithm;
5168 # the result is in JWS Compact Serialization format.
5169 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5170 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
5171 def jwt_encode_hs256(payload_data, key, headers={}):
5172 header_data = {
5173 'alg': 'HS256',
5174 'typ': 'JWT',
5175 }
5176 if headers:
5177 header_data.update(headers)
5178 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5179 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5180 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5181 signature_b64 = base64.b64encode(h.digest())
5182 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5183 return token
5184
5185
5186 # Can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5187 def jwt_decode_hs256(jwt):
5188 header_b64, payload_b64, signature_b64 = jwt.split('.')
5189 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5190 return payload_data
5191
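# Round-trip sketch (illustrative; note that jwt_decode_hs256 does not verify
# the signature):
#   token = jwt_encode_hs256({'id': 123}, 'secret')
#   jwt_decode_hs256(token.decode('utf-8'))  -> {'id': 123}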
5192
5193 def supports_terminal_sequences(stream):
5194 if compat_os_name == 'nt':
5195 from .compat import WINDOWS_VT_MODE # Must be imported locally
5196 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5197 return False
5198 elif not os.getenv('TERM'):
5199 return False
5200 try:
5201 return stream.isatty()
5202 except BaseException:
5203 return False
5204
5205
5206 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5207
5208
5209 def remove_terminal_sequences(string):
5210 return _terminal_sequences_re.sub('', string)
5211
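# Example (illustrative): remove_terminal_sequences('\033[0;31mError\033[0m')
# returns plain 'Error', stripping the ANSI SGR sequences matched above.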
5212
5213 def number_of_digits(number):
5214 return len('%d' % number)
5215
5216
5217 def join_nonempty(*values, delim='-', from_dict=None):
5218 if from_dict is not None:
5219 values = map(from_dict.get, values)
5220 return delim.join(map(str, filter(None, values)))
5221
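# Examples (illustrative):
#   join_nonempty('mp4', None, '', 1080)              -> 'mp4-1080'
#   join_nonempty('id', 'ext', from_dict={'id': 'x'}) -> 'x'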
5222
5223 class Config:
5224 own_args = None
5225 filename = None
5226 __initialized = False
5227
5228 def __init__(self, parser, label=None):
5229 self._parser, self.label = parser, label
5230 self._loaded_paths, self.configs = set(), []
5231
5232 def init(self, args=None, filename=None):
5233 assert not self.__initialized
5234 directory = ''
5235 if filename:
5236 location = os.path.realpath(filename)
5237 directory = os.path.dirname(location)
5238 if location in self._loaded_paths:
5239 return False
5240 self._loaded_paths.add(location)
5241
5242 self.__initialized = True
5243 self.own_args, self.filename = args, filename
5244 for location in self._parser.parse_args(args)[0].config_locations or []:
5245 location = os.path.join(directory, expand_path(location))
5246 if os.path.isdir(location):
5247 location = os.path.join(location, 'yt-dlp.conf')
5248 if not os.path.exists(location):
5249 self._parser.error(f'config location {location} does not exist')
5250 self.append_config(self.read_file(location), location)
5251 return True
5252
5253 def __str__(self):
5254 label = join_nonempty(
5255 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5256 delim=' ')
5257 return join_nonempty(
5258 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5259 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5260 delim='\n')
5261
5262 @staticmethod
5263 def read_file(filename, default=[]):
5264 try:
5265 optionf = open(filename)
5266 except IOError:
5267 return default # silently skip if file is not present
5268 try:
5269 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5270 contents = optionf.read()
5271 if sys.version_info < (3,):
5272 contents = contents.decode(preferredencoding())
5273 res = compat_shlex_split(contents, comments=True)
5274 finally:
5275 optionf.close()
5276 return res
5277
5278 @staticmethod
5279 def hide_login_info(opts):
5280 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5281 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5282
5283 def _scrub_eq(o):
5284 m = eqre.match(o)
5285 if m:
5286 return m.group('key') + '=PRIVATE'
5287 else:
5288 return o
5289
5290 opts = list(map(_scrub_eq, opts))
5291 for idx, opt in enumerate(opts):
5292 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5293 opts[idx + 1] = 'PRIVATE'
5294 return opts
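# Illustrative scrubbing (comment only):
#   Config.hide_login_info(['-u', 'name', '--password=secret'])
#   -> ['-u', 'PRIVATE', '--password=PRIVATE']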
5295
5296 def append_config(self, *args, label=None):
5297 config = type(self)(self._parser, label)
5298 config._loaded_paths = self._loaded_paths
5299 if config.init(*args):
5300 self.configs.append(config)
5301
5302 @property
5303 def all_args(self):
5304 for config in reversed(self.configs):
5305 yield from config.all_args
5306 yield from self.own_args or []
5307
5308 def parse_args(self):
5309 return self._parser.parse_args(list(self.all_args))