yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import collections
  11 import contextlib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import email.header
  16 import errno
  17 import functools
  18 import gzip
  19 import hashlib
  20 import hmac
  21 import importlib.util
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import operator
  28 import os
  29 import platform
  30 import random
  31 import re
  32 import socket
  33 import ssl
  34 import subprocess
  35 import sys
  36 import tempfile
  37 import time
  38 import traceback
  39 import xml.etree.ElementTree
  40 import zlib
  41 import mimetypes
  42
  43 from .compat import (
  44     compat_HTMLParseError,
  45     compat_HTMLParser,
  46     compat_HTTPError,
  47     compat_basestring,
  48     compat_chr,
  49     compat_cookiejar,
  50     compat_ctypes_WINFUNCTYPE,
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_html_entities,
  54     compat_html_entities_html5,
  55     compat_http_client,
  56     compat_integer_types,
  57     compat_numeric_types,
  58     compat_kwargs,
  59     compat_os_name,
  60     compat_parse_qs,
  61     compat_shlex_split,
  62     compat_shlex_quote,
  63     compat_str,
  64     compat_struct_pack,
  65     compat_struct_unpack,
  66     compat_urllib_error,
  67     compat_urllib_parse,
  68     compat_urllib_parse_urlencode,
  69     compat_urllib_parse_urlparse,
  70     compat_urllib_parse_urlunparse,
  71     compat_urllib_parse_quote,
  72     compat_urllib_parse_quote_plus,
  73     compat_urllib_parse_unquote_plus,
  74     compat_urllib_request,
  75     compat_urlparse,
  76     compat_xpath,
  77 )
  78
  79 from .socks import (
  80     ProxyType,
  81     sockssocket,
  82 )
  83
  84
  85 def register_socks_protocols():
  86     # "Register" SOCKS protocols
  87     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  88     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  89     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  90         if scheme not in compat_urlparse.uses_netloc:
  91             compat_urlparse.uses_netloc.append(scheme)
  92
  93
# The type of a compiled regular expression, taken from an actual compiled
# pattern because it is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  96
  97
  98 def random_user_agent():
  99     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 100     _CHROME_VERSIONS = (
 101         '90.0.4430.212',
 102         '90.0.4430.24',
 103         '90.0.4430.70',
 104         '90.0.4430.72',
 105         '90.0.4430.85',
 106         '90.0.4430.93',
 107         '91.0.4472.101',
 108         '91.0.4472.106',
 109         '91.0.4472.114',
 110         '91.0.4472.124',
 111         '91.0.4472.164',
 112         '91.0.4472.19',
 113         '91.0.4472.77',
 114         '92.0.4515.107',
 115         '92.0.4515.115',
 116         '92.0.4515.131',
 117         '92.0.4515.159',
 118         '92.0.4515.43',
 119         '93.0.4556.0',
 120         '93.0.4577.15',
 121         '93.0.4577.63',
 122         '93.0.4577.82',
 123         '94.0.4606.41',
 124         '94.0.4606.54',
 125         '94.0.4606.61',
 126         '94.0.4606.71',
 127         '94.0.4606.81',
 128         '94.0.4606.85',
 129         '95.0.4638.17',
 130         '95.0.4638.50',
 131         '95.0.4638.54',
 132         '95.0.4638.69',
 133         '95.0.4638.74',
 134         '96.0.4664.18',
 135         '96.0.4664.45',
 136         '96.0.4664.55',
 137         '96.0.4664.93',
 138         '97.0.4692.20',
 139     )
 140     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 141
 142
# HTTP header names mapped to header values (presumably the defaults sent with
# requests; the consumers are not visible in this part of the file).
# NOTE: random_user_agent() is called once, when this module is imported, so
# the User-Agent stays the same for the lifetime of the process
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 150
 151
# Fixed User-Agent strings keyed by browser name (only 'Safari' is defined here)
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 155
 156
# Unique sentinel used as the default of `default=` parameters (e.g. in
# xpath_element) to tell "no default supplied" apart from an explicit None
NO_DEFAULT = object()
 158
# English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language key ('en', 'fr'), each list ordered January first
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 169
# File extensions (without the leading dot) of media container, audio and
# manifest formats; presumably the set of extensions accepted as-is by callers
# (usage is not visible in this part of the file)
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 184
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement;
# the replacements are chained in the same order, with multi-letter ones
# ('AE', 'OE', 'TH', 'ss', ...) given as one-element lists so that they count
# as a single item
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 189
# strftime/strptime-style date format strings that do not depend on whether
# the day or the month comes first in purely numeric dates
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
 234
# DATE_FORMATS plus the numeric formats that put the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# DATE_FORMATS plus the numeric formats that put the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 253
# Regex source capturing four groups from text of the form
# }('...',<digits>,<digits>,'...'.split('|') -- presumably the argument list
# of packed JavaScript; verify against the callers
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Regex source matching a <script> element whose type is application/ld+json;
# the element's content is captured in the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 256
 257
 258 def preferredencoding():
 259     """Get preferred encoding.
 260
 261     Returns the best encoding scheme for the system, based on
 262     locale.getpreferredencoding() and some further tweaks.
 263     """
 264     try:
 265         pref = locale.getpreferredencoding()
 266         'TEST'.encode(pref)
 267     except Exception:
 268         pref = 'UTF-8'
 269
 270     return pref
 271
 272
 273 def write_json_file(obj, fn):
 274     """ Encode obj as JSON and write it to fn, atomically if possible """
 275
 276     fn = encodeFilename(fn)
 277     if sys.version_info < (3, 0) and sys.platform != 'win32':
 278         encoding = get_filesystem_encoding()
 279         # os.path.basename returns a bytes object, but NamedTemporaryFile
 280         # will fail if the filename contains non ascii characters unless we
 281         # use a unicode object
 282         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 283         # the same for os.path.dirname
 284         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 285     else:
 286         path_basename = os.path.basename
 287         path_dirname = os.path.dirname
 288
 289     args = {
 290         'suffix': '.tmp',
 291         'prefix': path_basename(fn) + '.',
 292         'dir': path_dirname(fn),
 293         'delete': False,
 294     }
 295
 296     # In Python 2.x, json.dump expects a bytestream.
 297     # In Python 3.x, it writes to a character stream
 298     if sys.version_info < (3, 0):
 299         args['mode'] = 'wb'
 300     else:
 301         args.update({
 302             'mode': 'w',
 303             'encoding': 'utf-8',
 304         })
 305
 306     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 307
 308     try:
 309         with tf:
 310             json.dump(obj, tf, ensure_ascii=False)
 311         if sys.platform == 'win32':
 312             # Need to remove existing file on Windows, else os.rename raises
 313             # WindowsError or FileExistsError.
 314             try:
 315                 os.unlink(fn)
 316             except OSError:
 317                 pass
 318         try:
 319             mask = os.umask(0)
 320             os.umask(mask)
 321             os.chmod(tf.name, 0o666 & ~mask)
 322         except OSError:
 323             pass
 324         os.rename(tf.name, fn)
 325     except Exception:
 326         try:
 327             os.remove(tf.name)
 328         except OSError:
 329             pass
 330         raise
 331
 332
 333 if sys.version_info >= (2, 7):
 334     def find_xpath_attr(node, xpath, key, val=None):
 335         """ Find the xpath xpath[@key=val] """
 336         assert re.match(r'^[a-zA-Z_-]+$', key)
 337         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 338         return node.find(expr)
 339 else:
 340     def find_xpath_attr(node, xpath, key, val=None):
 341         for f in node.findall(compat_xpath(xpath)):
 342             if key not in f.attrib:
 343                 continue
 344             if val is None or f.attrib.get(key) == val:
 345                 return f
 346         return None
 347
 348 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 349 # the namespace parameter
 350
 351
 352 def xpath_with_ns(path, ns_map):
 353     components = [c.split(':') for c in path.split('/')]
 354     replaced = []
 355     for c in components:
 356         if len(c) == 1:
 357             replaced.append(c[0])
 358         else:
 359             ns, tag = c
 360             replaced.append('{%s}%s' % (ns_map[ns], tag))
 361     return '/'.join(replaced)
 362
 363
 364 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 365     def _find_xpath(xpath):
 366         return node.find(compat_xpath(xpath))
 367
 368     if isinstance(xpath, (str, compat_str)):
 369         n = _find_xpath(xpath)
 370     else:
 371         for xp in xpath:
 372             n = _find_xpath(xp)
 373             if n is not None:
 374                 break
 375
 376     if n is None:
 377         if default is not NO_DEFAULT:
 378             return default
 379         elif fatal:
 380             name = xpath if name is None else name
 381             raise ExtractorError('Could not find XML element %s' % name)
 382         else:
 383             return None
 384     return n
 385
 386
 387 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 388     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 389     if n is None or n == default:
 390         return n
 391     if n.text is None:
 392         if default is not NO_DEFAULT:
 393             return default
 394         elif fatal:
 395             name = xpath if name is None else name
 396             raise ExtractorError('Could not find XML element\'s text %s' % name)
 397         else:
 398             return None
 399     return n.text
 400
 401
 402 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 403     n = find_xpath_attr(node, xpath, key)
 404     if n is None:
 405         if default is not NO_DEFAULT:
 406             return default
 407         elif fatal:
 408             name = '%s[@%s]' % (xpath, key) if name is None else name
 409             raise ExtractorError('Could not find XML attribute %s' % name)
 410         else:
 411             return None
 412     return n.attrib[key]
 413
 414
 415 def get_element_by_id(id, html):
 416     """Return the content of the tag with the specified ID in the passed HTML document"""
 417     return get_element_by_attribute('id', id, html)
 418
 419
 420 def get_element_html_by_id(id, html):
 421     """Return the html of the tag with the specified ID in the passed HTML document"""
 422     return get_element_html_by_attribute('id', id, html)
 423
 424
 425 def get_element_by_class(class_name, html):
 426     """Return the content of the first tag with the specified class in the passed HTML document"""
 427     retval = get_elements_by_class(class_name, html)
 428     return retval[0] if retval else None
 429
 430
 431 def get_element_html_by_class(class_name, html):
 432     """Return the html of the first tag with the specified class in the passed HTML document"""
 433     retval = get_elements_html_by_class(class_name, html)
 434     return retval[0] if retval else None
 435
 436
 437 def get_element_by_attribute(attribute, value, html, escape_value=True):
 438     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 439     return retval[0] if retval else None
 440
 441
 442 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
 443     retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
 444     return retval[0] if retval else None
 445
 446
 447 def get_elements_by_class(class_name, html):
 448     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 449     return get_elements_by_attribute(
 450         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 451         html, escape_value=False)
 452
 453
 454 def get_elements_html_by_class(class_name, html):
 455     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 456     return get_elements_html_by_attribute(
 457         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 458         html, escape_value=False)
 459
 460
 461 def get_elements_by_attribute(*args, **kwargs):
 462     """Return the content of the tag with the specified attribute in the passed HTML document"""
 463     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 464
 465
 466 def get_elements_html_by_attribute(*args, **kwargs):
 467     """Return the html of the tag with the specified attribute in the passed HTML document"""
 468     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 469
 470
 471 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 472     """
 473     Return the text (content) and the html (whole) of the tag with the specified
 474     attribute in the passed HTML document
 475     """
 476
 477     value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 478
 479     value = re.escape(value) if escape_value else value
 480
 481     partial_element_re = r'''(?x)
 482         <(?P<tag>[a-zA-Z0-9:._-]+)
 483          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 484          \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
 485         ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
 486
 487     for m in re.finditer(partial_element_re, html):
 488         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 489
 490         yield (
 491             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 492             whole
 493         )
 494
 495
 496 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 497     """
 498     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 499     closing tag for the first opening tag it has encountered, and can be used
 500     as a context manager
 501     """
 502
 503     class HTMLBreakOnClosingTagException(Exception):
 504         pass
 505
 506     def __init__(self):
 507         self.tagstack = collections.deque()
 508         compat_HTMLParser.__init__(self)
 509
 510     def __enter__(self):
 511         return self
 512
 513     def __exit__(self, *_):
 514         self.close()
 515
 516     def close(self):
 517         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 518         # so data remains buffered; we no longer have any interest in it, thus
 519         # override this method to discard it
 520         pass
 521
 522     def handle_starttag(self, tag, _):
 523         self.tagstack.append(tag)
 524
 525     def handle_endtag(self, tag):
 526         if not self.tagstack:
 527             raise compat_HTMLParseError('no tags in the stack')
 528         while self.tagstack:
 529             inner_tag = self.tagstack.pop()
 530             if inner_tag == tag:
 531                 break
 532         else:
 533             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 534         if not self.tagstack:
 535             raise self.HTMLBreakOnClosingTagException()
 536
 537
 538 def get_element_text_and_html_by_tag(tag, html):
 539     """
 540     For the first element with the specified tag in the passed HTML document
 541     return its' content (text) and the whole element (html)
 542     """
 543     def find_or_raise(haystack, needle, exc):
 544         try:
 545             return haystack.index(needle)
 546         except ValueError:
 547             raise exc
 548     closing_tag = f'</{tag}>'
 549     whole_start = find_or_raise(
 550         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 551     content_start = find_or_raise(
 552         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 553     content_start += whole_start + 1
 554     with HTMLBreakOnClosingTagParser() as parser:
 555         parser.feed(html[whole_start:content_start])
 556         if not parser.tagstack or parser.tagstack[0] != tag:
 557             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 558         offset = content_start
 559         while offset < len(html):
 560             next_closing_tag_start = find_or_raise(
 561                 html[offset:], closing_tag,
 562                 compat_HTMLParseError(f'closing {tag} tag not found'))
 563             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 564             try:
 565                 parser.feed(html[offset:offset + next_closing_tag_end])
 566                 offset += next_closing_tag_end
 567             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 568                 return html[content_start:offset + next_closing_tag_start], \
 569                     html[whole_start:offset + next_closing_tag_end]
 570         raise compat_HTMLParseError('unexpected end of html')
 571
 572
 573 class HTMLAttributeParser(compat_HTMLParser):
 574     """Trivial HTML parser to gather the attributes for a single element"""
 575
 576     def __init__(self):
 577         self.attrs = {}
 578         compat_HTMLParser.__init__(self)
 579
 580     def handle_starttag(self, tag, attrs):
 581         self.attrs = dict(attrs)
 582
 583
 584 class HTMLListAttrsParser(compat_HTMLParser):
 585     """HTML parser to gather the attributes for the elements of a list"""
 586
 587     def __init__(self):
 588         compat_HTMLParser.__init__(self)
 589         self.items = []
 590         self._level = 0
 591
 592     def handle_starttag(self, tag, attrs):
 593         if tag == 'li' and self._level == 0:
 594             self.items.append(dict(attrs))
 595         self._level += 1
 596
 597     def handle_endtag(self, tag):
 598         self._level -= 1
 599
 600
 601 def extract_attributes(html_element):
 602     """Given a string for an HTML element such as
 603     <el
 604          a="foo" B="bar" c="&98;az" d=boz
 605          empty= noval entity="&amp;"
 606          sq='"' dq="'"
 607     >
 608     Decode and return a dictionary of attributes.
 609     {
 610         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 611         'empty': '', 'noval': None, 'entity': '&',
 612         'sq': '"', 'dq': '\''
 613     }.
 614     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 615     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 616     """
 617     parser = HTMLAttributeParser()
 618     try:
 619         parser.feed(html_element)
 620         parser.close()
 621     # Older Python may throw HTMLParseError in case of malformed HTML
 622     except compat_HTMLParseError:
 623         pass
 624     return parser.attrs
 625
 626
 627 def parse_list(webpage):
 628     """Given a string for an series of HTML <li> elements,
 629     return a dictionary of their attributes"""
 630     parser = HTMLListAttrsParser()
 631     parser.feed(webpage)
 632     parser.close()
 633     return parser.items
 634
 635
 636 def clean_html(html):
 637     """Clean an HTML snippet into a readable string"""
 638
 639     if html is None:  # Convenience for sanitizing descriptions etc.
 640         return html
 641
 642     # Newline vs <br />
 643     html = html.replace('\n', ' ')
 644     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 645     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 646     # Strip html tags
 647     html = re.sub('<.*?>', '', html)
 648     # Replace html entities
 649     html = unescapeHTML(html)
 650     return html.strip()
 651
 652
 653 def sanitize_open(filename, open_mode):
 654     """Try to open the given filename, and slightly tweak it if this fails.
 655
 656     Attempts to open the given filename. If this fails, it tries to change
 657     the filename slightly, step by step, until it's either able to open it
 658     or it fails and raises a final exception, like the standard open()
 659     function.
 660
 661     It returns the tuple (stream, definitive_file_name).
 662     """
 663     try:
 664         if filename == '-':
 665             if sys.platform == 'win32':
 666                 import msvcrt
 667                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 668             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 669         stream = open(encodeFilename(filename), open_mode)
 670         return (stream, filename)
 671     except (IOError, OSError) as err:
 672         if err.errno in (errno.EACCES,):
 673             raise
 674
 675         # In case of error, try to remove win32 forbidden chars
 676         alt_filename = sanitize_path(filename)
 677         if alt_filename == filename:
 678             raise
 679         else:
 680             # An exception here should be caught in the caller
 681             stream = open(encodeFilename(alt_filename), open_mode)
 682             return (stream, alt_filename)
 683
 684
 685 def timeconvert(timestr):
 686     """Convert RFC 2822 defined time string into system timestamp"""
 687     timestamp = None
 688     timetuple = email.utils.parsedate_tz(timestr)
 689     if timetuple is not None:
 690         timestamp = email.utils.mktime_tz(timetuple)
 691     return timestamp
 692
 693
 694 def sanitize_filename(s, restricted=False, is_id=False):
 695     """Sanitizes a string so it could be used as part of a filename.
 696     If restricted is set, use a stricter subset of allowed characters.
 697     Set is_id if this is not an arbitrary string, but an ID that should be kept
 698     if possible.
 699     """
 700     def replace_insane(char):
 701         if restricted and char in ACCENT_CHARS:
 702             return ACCENT_CHARS[char]
 703         elif not restricted and char == '\n':
 704             return ' '
 705         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 706             return ''
 707         elif char == '"':
 708             return '' if restricted else '\''
 709         elif char == ':':
 710             return '_-' if restricted else ' -'
 711         elif char in '\\/|*<>':
 712             return '_'
 713         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 714             return '_'
 715         if restricted and ord(char) > 127:
 716             return '_'
 717         return char
 718
 719     if s == '':
 720         return ''
 721     # Handle timestamps
 722     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 723     result = ''.join(map(replace_insane, s))
 724     if not is_id:
 725         while '__' in result:
 726             result = result.replace('__', '_')
 727         result = result.strip('_')
 728         # Common case of "Foreign band name - English song title"
 729         if restricted and result.startswith('-_'):
 730             result = result[2:]
 731         if result.startswith('-'):
 732             result = '_' + result[len('-'):]
 733         result = result.lstrip('.')
 734         if not result:
 735             result = '_'
 736     return result
 737
 738
 739 def sanitize_path(s, force=False):
 740     """Sanitizes and normalizes path on Windows"""
 741     if sys.platform == 'win32':
 742         force = False
 743         drive_or_unc, _ = os.path.splitdrive(s)
 744         if sys.version_info < (2, 7) and not drive_or_unc:
 745             drive_or_unc, _ = os.path.splitunc(s)
 746     elif force:
 747         drive_or_unc = ''
 748     else:
 749         return s
 750
 751     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 752     if drive_or_unc:
 753         norm_path.pop(0)
 754     sanitized_path = [
 755         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 756         for path_part in norm_path]
 757     if drive_or_unc:
 758         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 759     elif force and s[0] == os.path.sep:
 760         sanitized_path.insert(0, os.path.sep)
 761     return os.path.join(*sanitized_path)
 762
 763
 764 def sanitize_url(url):
 765     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 766     # the number of unwanted failures due to missing protocol
 767     if url.startswith('//'):
 768         return 'http:%s' % url
 769     # Fix some common typos seen so far
 770     COMMON_TYPOS = (
 771         # https://github.com/ytdl-org/youtube-dl/issues/15649
 772         (r'^httpss://', r'https://'),
 773         # https://bx1.be/lives/direct-tv/
 774         (r'^rmtp([es]?)://', r'rtmp\1://'),
 775     )
 776     for mistake, fixup in COMMON_TYPOS:
 777         if re.match(mistake, url):
 778             return re.sub(mistake, fixup, url)
 779     return url
 780
 781
 782 def extract_basic_auth(url):
 783     parts = compat_urlparse.urlsplit(url)
 784     if parts.username is None:
 785         return url, None
 786     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 787         parts.hostname if parts.port is None
 788         else '%s:%d' % (parts.hostname, parts.port))))
 789     auth_payload = base64.b64encode(
 790         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 791     return url, 'Basic ' + auth_payload.decode('utf-8')
 792
 793
 794 def sanitized_Request(url, *args, **kwargs):
 795     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 796     if auth_header is not None:
 797         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 798         headers['Authorization'] = auth_header
 799     return compat_urllib_request.Request(url, *args, **kwargs)
 800
 801
 802 def expand_path(s):
 803     """Expand shell variables and ~"""
 804     return os.path.expandvars(compat_expanduser(s))
 805
 806
 807 def orderedSet(iterable):
 808     """ Remove all duplicates from the input iterable """
 809     res = []
 810     for el in iterable:
 811         if el not in res:
 812             res.append(el)
 813     return res
 814
 815
 816 def _htmlentity_transform(entity_with_semicolon):
 817     """Transforms an HTML entity to a character."""
 818     entity = entity_with_semicolon[:-1]
 819
 820     # Known non-numeric HTML entity
 821     if entity in compat_html_entities.name2codepoint:
 822         return compat_chr(compat_html_entities.name2codepoint[entity])
 823
 824     # TODO: HTML5 allows entities without a semicolon. For example,
 825     # '&Eacuteric' should be decoded as 'Éric'.
 826     if entity_with_semicolon in compat_html_entities_html5:
 827         return compat_html_entities_html5[entity_with_semicolon]
 828
 829     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 830     if mobj is not None:
 831         numstr = mobj.group(1)
 832         if numstr.startswith('x'):
 833             base = 16
 834             numstr = '0%s' % numstr
 835         else:
 836             base = 10
 837         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 838         try:
 839             return compat_chr(int(numstr, base))
 840         except ValueError:
 841             pass
 842
 843     # Unknown entity in name, return its literal representation
 844     return '&%s;' % entity
 845
 846
 847 def unescapeHTML(s):
 848     if s is None:
 849         return None
 850     assert type(s) == compat_str
 851
 852     return re.sub(
 853         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 854
 855
 856 def escapeHTML(text):
 857     return (
 858         text
 859         .replace('&', '&amp;')
 860         .replace('<', '&lt;')
 861         .replace('>', '&gt;')
 862         .replace('"', '&quot;')
 863         .replace("'", '&#39;')
 864     )
 865
 866
 867 def process_communicate_or_kill(p, *args, **kwargs):
 868     try:
 869         return p.communicate(*args, **kwargs)
 870     except BaseException:  # Including KeyboardInterrupt
 871         p.kill()
 872         p.wait()
 873         raise
 874
 875
 876 class Popen(subprocess.Popen):
 877     if sys.platform == 'win32':
 878         _startupinfo = subprocess.STARTUPINFO()
 879         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 880     else:
 881         _startupinfo = None
 882
 883     def __init__(self, *args, **kwargs):
 884         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 885
 886     def communicate_or_kill(self, *args, **kwargs):
 887         return process_communicate_or_kill(self, *args, **kwargs)
 888
 889
 890 def get_subprocess_encoding():
 891     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 892         # For subprocess calls, encode with locale encoding
 893         # Refer to http://stackoverflow.com/a/9951851/35070
 894         encoding = preferredencoding()
 895     else:
 896         encoding = sys.getfilesystemencoding()
 897     if encoding is None:
 898         encoding = 'utf-8'
 899     return encoding
 900
 901
 902 def encodeFilename(s, for_subprocess=False):
 903     """
 904     @param s The name of the file
 905     """
 906
 907     assert type(s) == compat_str
 908
 909     # Python 3 has a Unicode API
 910     if sys.version_info >= (3, 0):
 911         return s
 912
 913     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 914     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 915     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 916     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 917         return s
 918
 919     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 920     if sys.platform.startswith('java'):
 921         return s
 922
 923     return s.encode(get_subprocess_encoding(), 'ignore')
 924
 925
 926 def decodeFilename(b, for_subprocess=False):
 927
 928     if sys.version_info >= (3, 0):
 929         return b
 930
 931     if not isinstance(b, bytes):
 932         return b
 933
 934     return b.decode(get_subprocess_encoding(), 'ignore')
 935
 936
 937 def encodeArgument(s):
 938     if not isinstance(s, compat_str):
 939         # Legacy code that uses byte strings
 940         # Uncomment the following line after fixing all post processors
 941         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 942         s = s.decode('ascii')
 943     return encodeFilename(s, True)
 944
 945
 946 def decodeArgument(b):
 947     return decodeFilename(b, True)
 948
 949
 950 def decodeOption(optval):
 951     if optval is None:
 952         return optval
 953     if isinstance(optval, bytes):
 954         optval = optval.decode(preferredencoding())
 955
 956     assert isinstance(optval, compat_str)
 957     return optval
 958
 959
 960 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 961
 962
 963 def timetuple_from_msec(msec):
 964     secs, msec = divmod(msec, 1000)
 965     mins, secs = divmod(secs, 60)
 966     hrs, mins = divmod(mins, 60)
 967     return _timetuple(hrs, mins, secs, msec)
 968
 969
 970 def formatSeconds(secs, delim=':', msec=False):
 971     time = timetuple_from_msec(secs * 1000)
 972     if time.hours:
 973         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 974     elif time.minutes:
 975         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 976     else:
 977         ret = '%d' % time.seconds
 978     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 979
 980
 981 def _ssl_load_windows_store_certs(ssl_context, storename):
 982     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 983     try:
 984         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 985                  if encoding == 'x509_asn' and (
 986                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 987     except PermissionError:
 988         return
 989     for cert in certs:
 990         try:
 991             ssl_context.load_verify_locations(cadata=cert)
 992         except ssl.SSLError:
 993             pass
 994
 995
 996 def make_HTTPS_handler(params, **kwargs):
 997     opts_check_certificate = not params.get('nocheckcertificate')
 998     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 999     context.check_hostname = opts_check_certificate
1000     if params.get('legacyserverconnect'):
1001         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
1002     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1003     if opts_check_certificate:
1004         try:
1005             context.load_default_certs()
1006             # Work around the issue in load_default_certs when there are bad certificates. See:
1007             # https://github.com/yt-dlp/yt-dlp/issues/1060,
1008             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1009         except ssl.SSLError:
1010             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1011             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1012                 # Create a new context to discard any certificates that were already loaded
1013                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1014                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
1015                 for storename in ('CA', 'ROOT'):
1016                     _ssl_load_windows_store_certs(context, storename)
1017             context.set_default_verify_paths()
1018     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1019
1020
def bug_reports_message(before=';'):
    """Return the "please report this issue" text to append to an error message.

    @param before  Text that precedes the message. Trailing whitespace is
                   removed; if nothing is left, or it ends a sentence
                   ('.', '!' or '?'), the message starts with a capital letter
    """
    update_cmd = (
        'type  yt-dlp -U  to update' if ytdl_is_updateable()
        else 'see  https://github.com/yt-dlp/yt-dlp  on how to update')
    msg = ''.join((
        'please report this issue on  https://github.com/yt-dlp/yt-dlp .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call yt-dlp with the --verbose flag and include its complete output.',
    ))

    before = before.rstrip()
    starts_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    prefix = before + ' ' if before else ''
    return prefix + msg
1035
1036
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # No explicit message: keep the class-level default if there is
            # one, otherwise fall back to the name of the exception class
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1047
1048
# Exception types treated as network errors (ssl.CertificateError is included where it exists)
network_exceptions = (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1053
1054
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        The error is also treated as expected when it is created while an exception
        whose type is listed in network_exceptions is being handled.
        cause, video_id and ie, when given, are included in the final message.
        """
        active_exc_info = sys.exc_info()
        if active_exc_info[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = active_exc_info  # preserve original exception

        message_parts = [
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
        ]
        if not expected:
            # Unexpected errors ask the user to file a bug report
            message_parts.append(bug_reports_message())
        super().__init__(''.join(message_parts))

    def format_traceback(self):
        """Return the stored traceback as a string, or None if there is none."""
        tb = self.traceback
        return None if tb is None else ''.join(traceback.format_tb(tb))
1084
1085
class UnsupportedError(ExtractorError):
    """Expected extractor error for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1091
1092
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Adds nothing to ExtractorError apart from being a distinct exception type
    pass
1096
1097
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Always an expected error, whatever the caller passed
        kwargs.update(expected=True)
        super().__init__(msg, **kwargs)
        self.countries = countries
1109
1110
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
1123
1124
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Message used when the exception is created without an explicit one
    msg = 'Entry not found in info'
1132
1133
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Extends the message on this instance only; the class default stays as is
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1146
1147
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    # No constructor of its own: the message handling is inherited from YoutubeDLError
1154
1155
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when the exception is created without an explicit one
    msg = 'The download was cancelled'
1159
1160
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Message used when the exception is created without an explicit one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1164
1165
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Message used when the exception is created without an explicit one
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1169
1170
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Message used when the exception is created without an explicit one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1174
1175
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """ expected is stored on the exception as-is; it is not interpreted here. """
        super().__init__(msg)
        self.expected = expected
1182
1183
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always uses the class message and expected=False
        super().__init__(self.msg, expected=False)
1190
1191
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # Extends the message on this instance only; the class default stays as is
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1204
1205
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1221
1222
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno `code`, a `msg` and a `reason` derived from them.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' and 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (code in (errno.ENOSPC, errno.EDQUOT)
                or any(text in msg for text in ('No space left', 'Disk quota exceeded'))):
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1237
1238
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): nothing in this part of the file raises this; presumably it
    # signals that extended attributes cannot be used at all - confirm against callers
    pass
1241
1242
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and, if requested, bind it to a source address.

    @param ydl_handler  Handler whose `_params` dict is consulted for 'source_address'
    @param http_class   Connection class to instantiate with *args and **kwargs
    @param is_https     Only used by the Python 2.6 fallback to decide whether
                        the socket has to be wrapped with SSL
    @returns            The connection object. When a source address is set,
                        connecting is restricted to remote addresses of the
                        same IP family as that address
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NB: the `source_address` parameter shadows the outer variable; here it
            # is the (host, port) tuple to bind to, so [0] is the IP address string
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source address means IPv4, anything else is taken as IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try the candidates in order; the first successful connection wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and move on to the next candidate
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed: report the last error seen
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 is passed for the local end of the connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1306
1307
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' header.

    If that header is present, a copy of `headers` without it and
    without any Accept-Encoding header is returned; otherwise `headers`
    itself is returned. The argument is never modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered_headers = {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding'}
    del filtered_headers['Youtubedl-no-compression']
    return filtered_headers
1316
1317
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """ params is the options dict; it is stored as self._params. """
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open the request, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal header; it must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        if not data:
            return data
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Escape the request URL, add the standard headers and apply the internal ones."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate response bodies and percent-encode redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off, one more on each attempt
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped: report the first failure
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            # The body is no longer compressed
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    # Undo the iso-8859-1 decoding and read the raw bytes as UTF-8 instead
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # https requests and responses get the same treatment
    https_request = http_request
    https_response = http_response
1441
1442
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of `base_class` that connects through a SOCKS proxy.

    @param base_class   HTTPConnection or HTTPSConnection (or a subclass) to derive from
    @param socks_proxy  URL of the proxy. The scheme selects the protocol
                        (socks5, socks4a, socks4 or its alias socks), the port
                        defaults to 1080 and username/password, if present,
                        are percent-decoded
    @returns            The new connection class
    @raises ValueError  if the scheme of `socks_proxy` is not one of the above
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Without this branch socks_type would be left unbound and the failure
        # would only show up further down as an obscure UnboundLocalError
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1484
1485
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections with _create_http_connection.

    `params` is the options dict; `https_conn_class` optionally replaces
    HTTPSConnection as the connection class.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open the request, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set."""
        # Forward whichever of these attributes the base handler has set
        # (_context: python > 2.6, _check_hostname: python 3.x)
        forwarded = (('context', '_context'), ('check_hostname', '_check_hostname'))
        open_kwargs = {
            name: getattr(self, attr) for name, attr in forwarded if hasattr(self, attr)}

        conn_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal header; it must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        create_connection = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(create_connection, req, **open_kwargs)
1509
1510
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    Cookie jar stored in a Netscape format cookie file that is always
    read and written as UTF-8.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    # Prefix of lines that hold HttpOnly cookies; removed before parsing
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields of a cookie line
    _ENTRY_LEN = 7
    # Written at the top of every saved file
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    # The fields of a cookie line, in file order
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.

        filename defaults to self.filename; ValueError is raised if neither is set.
        Cookies marked as discard are skipped unless ignore_discard is set,
        expired ones unless ignore_expires is set.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        # NOTE(review): this changes the cookies held in the jar, not only the
        # written output - confirm that session cookies are meant to keep
        # expires=0 in memory after saving
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    # Presumably unreachable, as the loop above replaces None with 0
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither is set.
        Malformed entries are skipped with a warning on stderr.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Returns the line to parse (HttpOnly prefix removed), or raises
            # LoadError if it is not a well-formed cookie entry
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # The cleaned-up lines are collected in memory and parsed from there
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1627
1628
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that handles https requests and responses like http ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # The workaround for that (percent-encoding the Set-Cookie and Set-Cookie2
        # headers before they are processed) only applied to Python 2 and is
        # disabled, so the response is simply passed to the base implementation.
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1651
1652
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Derived from the HTTPRedirectHandler implementation in CPython [1],
    with two additions:
     - the redirect URL is forced to unicode under python 2
     - the experimental 308 Permanent Redirect status [2], used by some
       sites [3], is followed like the other redirect codes

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Every redirect status is served by the stock 302 implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect response.

        Called by the http_error_30x methods. Returns a new Request aimed
        at `newurl`, or raises HTTPError when this combination of status
        code and request method must not be redirected.
        """
        method = req.get_method()
        # GET/HEAD may follow any redirect status; POST only 301/302/303.
        # Strictly (according to RFC 2616), 301 or 302 in response to a
        # POST MUST NOT be followed without user confirmation, but in
        # practice essentially all clients do, so we do the same.
        idempotent_ok = code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
        post_ok = code in (301, 302, 303) and method == "POST"
        if not (idempotent_ok or post_ok):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)

        # On python 2 urlh.geturl() may sometimes return the redirect URL
        # as a byte string; force it to unicode there.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be lenient with URIs containing a space. Mostly redundant with
        # the fuller encoding in http_error_302(), but kept for other callers.
        newurl = newurl.replace(' ', '%20')

        # Headers describing the original body must not be replayed
        body_headers = ("content-length", "content-type")
        forwarded = {
            name: value
            for name, value in req.headers.items()
            if name.lower() not in body_headers
        }
        return compat_urllib_request.Request(
            newurl,
            headers=forwarded,
            origin_req_host=req.origin_req_host,
            unverifiable=True)
1708
1709
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Returns a `(timedelta, remaining_str)` tuple. The timedelta is the UTC
    offset given by a trailing `+hh[:]mm` / `-hh[:]mm`, and zero for a `Z`
    designator or when nothing is recognized (the string is then returned
    unchanged).
    """
    found = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not found:
        return datetime.timedelta(), date_str

    # Cut as many characters off the end as the designator is long
    remainder = date_str[:-len(found.group('tz'))]
    sign_char = found.group('sign')
    if not sign_char:
        # Bare `Z`: UTC, no offset
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(found.group('hours')),
        minutes=direction * int(found.group('minutes')))
    return offset, remainder
1734
1735
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    `delimiter` separates the date from the time part. `timezone` is the
    UTC offset as a timedelta; when omitted it is taken from the string
    itself. Returns None for a None input or an unparsable string.
    """
    if date_str is None:
        return None

    # Fractional seconds are not covered by the strptime() layout below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        local_time = datetime.datetime.strptime(date_str, layout)
    except ValueError:
        return None
    return calendar.timegm((local_time - timezone).timetuple())
1753
1754
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1757
1758
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when `date_str` is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas only get in the way of the strptime() patterns
    cleaned = date_str.replace(',', ' ')
    # Drop AM/PM markers (plus a following zone name) and numeric UTC offsets
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every pattern is tried; when several match, the last match wins
    for pattern in date_formats(day_first):
        with contextlib.suppress(ValueError):
            result = datetime.datetime.strptime(cleaned, pattern).strftime('%Y%m%d')

    if result is None:
        # Last resort: RFC 2822 style dates
        fields = email.utils.parsedate_tz(cleaned)
        if fields:
            with contextlib.suppress(ValueError):
                result = datetime.datetime(*fields[:6]).strftime('%Y%m%d')

    return None if result is None else compat_str(result)
1785
1786
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    The string is matched against the known date formats (day-first or
    month-first, depending on `day_first`) and, failing that, parsed as an
    RFC 2822 style date. A trailing numeric UTC offset is honored on both
    paths. Returns None when `date_str` is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # A PM marker anywhere in the string shifts the parsed time by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The UTC offset was already stripped from date_str above and
        # calendar.timegm() ignores the offset field of the tuple, so the
        # extracted offset has to be applied explicitly here as well.
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
1818
1819
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    Returns the text after the last dot of the part before any `?` when
    it is purely alphanumeric or, with trailing slashes removed, a known
    extension. Otherwise returns `default_ext`.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1831
1832
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by `<sub_lang>.<sub_format>`."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1835
1836
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse an absolute date
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    current = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return current
    if date_str == 'yesterday':
        return current - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if relative is None:
        # Plain absolute date
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the sign is itself a date expression
    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware shift; the result is day-precise
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1877
1878
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse an absolute date
    """
    moment = datetime_from_str(date_str, precision='microsecond', format=format)
    return moment.date()
1887
1888
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the target month.
    """
    # Work with a zero-based month index so divmod() carries into the year
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_shift
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1896
1897
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    precision: microsecond|second|minute|hour|day. 'microsecond' returns
    `dt` unchanged; the other units round half up on whole seconds.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    step = seconds_per_unit[precision]
    # timetuple() carries no microseconds, so they are dropped here
    stamp = calendar.timegm(dt.timetuple())
    rounded = ((stamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1914
1915
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Any other string is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1924
1925
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest / latest representable
        date. Raises ValueError when start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1955
1956
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode the value if the platform module handed back bytes
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1965
1966
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    # Second field of win32_ver() is the version string
    return version_tuple(platform.win32_ver()[1])
1973
1974
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The "special method" is the WriteConsoleW function of kernel32, used
    only when `out` is backed by file descriptor 1 or 2 and the matching
    standard handle is a console. In every other case nothing is written
    and the caller has to write `s` itself.

    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 and -12 are STD_OUTPUT_HANDLE and STD_ERROR_HANDLE in the Windows API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, for a handle whose file type
        # (ignoring the REMOTE bit) is not FILE_TYPE_CHAR, and for a handle
        # on which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it instead
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
2047
2048
def write_string(s, out=None, encoding=None):
    """Write the text `s` to the stream `out` (sys.stderr by default) and flush it.

    Binary streams, and text streams exposing a `buffer`, receive the text
    encoded with `encoding` (or a fallback encoding); characters that
    cannot be encoded are dropped. Other streams get the text as is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows try the console API first, unless an encoding is forced
    console_candidate = sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno')
    if console_candidate and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        codec = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(codec, 'ignore'))
    else:
        out.write(s)
    out.flush()
2069
2070
def bytes_to_intlist(bs):
    """Return the items of the byte string `bs` as a list of integers."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Items are characters: convert each to its code point
    return [ord(ch) for ch in bs]
2078
2079
def intlist_to_bytes(xs):
    """Pack a list of integers, one unsigned byte each, into a byte string."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
2084
2085
# Cross-platform file locking
# Whichever branch runs defines the pair _lock_file(f, exclusive) and
# _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high parts of the byte count handed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file `f` via LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file() can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 (LOCKFILE_EXCLUSIVE_LOCK in the Windows API) for an
        # exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file(); raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on `f`."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock on `f`."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: no locking primitive is available."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: no locking primitive is available."""
            raise IOError(UNSUPPORTED_MSG)
2159
2160
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction. Entering the context takes the
    lock (shared for mode 'r', exclusive for 'a' and 'w'); leaving it
    releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Only plain readers may share the lock
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leak the handle when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2190
2191
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when Python reports none."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
2195
2196
def shell_quote(args):
    """Return `args` as a single shell-quoted, space-separated string."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2206
2207
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into the URL fragment. Data already smuggled
    into `url` is kept and takes precedence over `data` for equal keys.
    The `data` dict passed in is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a fresh dict: updating `data` in place would leak the
    # previously smuggled values into the caller's dict
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2216
2217
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into `(url, data)`.

    URLs without smuggled data are returned unchanged together with
    `default`.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(payload)
2225
2226
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    `fmt` receives the scaled number and the suffix. With factor=1024
    the binary suffixes (Ki, Mi, ...) are used. Returns None when `num`
    is not a number or is negative.
    """
    num, factor = float_or_none(num), float(factor)
    # math.log() is undefined for negative numbers
    if num is None or num < 0:
        return None
    suffixes = ['', *'kMGTPEZY']
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes: values beyond the largest one
        # keep that suffix, and fractions below 1 get no suffix instead
        # of indexing the list from its end
        exponent = max(0, min(int(math.log(num, factor)), len(suffixes) - 1))
    suffix = suffixes[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2238
2239
def format_bytes(bytes):
    """Format a byte count with binary suffixes (e.g. KiB), or 'N/A' if that yields nothing."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2242
2243
def lookup_unit_table(unit_table, s):
    """Parse a leading `<number><unit>` from `s` and scale it.

    `unit_table` maps unit names to multipliers. The number may use ','
    or '.' as the decimal separator. Returns the scaled value truncated
    to an int, or None if `s` does not start with a number and a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    number = found.group('num').replace(',', '.')
    multiplier = unit_table[found.group('unit')]
    return int(float(number) * multiplier)
2253
2254
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into a number of bytes.

    Returns None when `s` is None or is not a number followed by a known unit.
    """
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name), in rising powers
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        unit_table.update({
            letter + 'iB': binary,
            letter + 'B': decimal,
            lower + 'B': binary,
            letter + 'b': decimal,
            lower + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2324
2325
def parse_count(s):
    """Parse a count such as '1,234', '1.2M' or 'views 10k' into an integer.

    Returns None when `s` is None or no number can be found.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    # Digits and separators only: no unit to apply
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Fall back to a leading number followed by whitespace or the end
    leading = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading.group(1)) if leading else None
2353
2354
def parse_resolution(s):
    """Extract a resolution from a string.

    Recognizes, in this order, `<width>x<height>`, `<height>p` / `<height>i`
    and `4k` / `8k`. Returns a dict with 'width' and/or 'height', or an
    empty dict when nothing matches or `s` is None.
    """
    if s is None:
        return {}

    dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(kilo.group(1)) * 540}

    return {}
2375
2376
def parse_bitrate(s):
    """Return the integer in front of 'kbps' in `s`, or None if there is none."""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2383
2384
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in language `lang`

    Languages without an entry in MONTH_NAMES fall back to the English
    names. Returns None for an unknown name.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2394
2395
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations; None for an unknown abbreviation """
    # Abbreviations are the first three letters of the English names
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2404
2405
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2412
2413
def setproctitle(title):
    """Set the process title through prctl() of libc.so.6, where available.

    Does nothing when libc.so.6 cannot be loaded or offers no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialise the buffer from the bytes themselves: this makes it one
    # byte larger than the title, so it is always NUL-terminated. A buffer
    # of exactly len(title) bytes has no terminator, and prctl() could
    # read past its end for short titles.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2438
2439
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` itself if it is None or lacks the prefix."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2442
2443
def remove_end(s, end):
    """Return `s` without the suffix `end`; `s` itself if it is None or lacks the suffix."""
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: `s[:-len(end)]` would turn into `s[:0]`
    # and wipe the whole string when `end` is empty
    return s[:len(s) - len(end)]
2446
2447
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2455
2456
def get_domain(url):
    """Return the host part of `url` without scheme and leading 'www.', or None."""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
2460
2461
def url_basename(url):
    """Return the last segment of the path of `url`."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
2465
2466
def base_url(url):
    """Return `url` up to and including the last '/' before any '?', '#' or '&'."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
2469
2470
def urljoin(base, path):
    """Join `path` onto `base`, tolerating bytes and invalid input.

    A `path` that is already absolute or protocol-relative is returned as
    is. Returns None when `path` is empty or not a string, or when `base`
    is not an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    # Already carries its own host: nothing to resolve
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    usable_base = isinstance(base, compat_str) and re.match(r'^(?:https?:)?//', base)
    if not usable_base:
        return None
    return compat_urlparse.urljoin(base, path)
2484
2485
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        """Return 'HEAD' regardless of the request data."""
        return 'HEAD'
2489
2490
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        """Return 'PUT' regardless of the request data."""
        return 'PUT'
2494
2495
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to an int, or return `default` if that is not possible.

    The result is multiplied by `invscale` and floor-divided by `scale`.
    With `get_attr`, the named attribute of `v` is converted instead.
    """
    value = v
    if get_attr and value is not None:
        value = getattr(value, get_attr, None)
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2503
2504
def str_or_none(v, default=None):
    """Return `v` converted to a string, or `default` if `v` is None."""
    if v is None:
        return default
    return compat_str(v)
2507
2508
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned as they are. Strings are converted after ',',
    '.' and '+' have been removed. Anything else gives None.
    """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if not isinstance(int_str, compat_str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2516
2517
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float, scaled by invscale / scale, or return `default`.

    `default` is returned for None and whenever the conversion raises
    ValueError or TypeError.
    """
    try:
        return default if v is None else float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
2525
2526
def bool_or_none(v, default=None):
    """Return `v` if it is a real bool, otherwise `default` (no truthiness coercion)."""
    if isinstance(v, bool):
        return v
    return default
2529
2530
def strip_or_none(v, default=None):
    """Return `v` with surrounding whitespace removed if it is a string, otherwise `default`."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
2533
2534
def url_or_none(url):
    """Return the stripped URL if it looks like a supported absolute URL, else None.

    Accepted are protocol-relative URLs ("//...") and URLs whose scheme is
    http(s), one of the rtmp/rtmfp/rtsp variants, mms or ftp(s).
    """
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2540
2541
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with `date_format`, returning `default` on any failure.

    @param timestamp    Either a number (unix timestamp, interpreted as UTC)
                        or a string in YYYYMMDD form
    @param date_format  strftime format string
    @param default      Value returned when the timestamp has another type,
                        cannot be parsed or is out of the supported range
    """
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For unsupported types datetime_object is still None here and the
        # resulting AttributeError selects the default
        return datetime_object.strftime(date_format)
    # utcfromtimestamp signals out-of-range values with OverflowError or
    # OSError (depending on value and platform) in addition to ValueError
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        return default
2552
2553
def parse_duration(s):
    """Parse a duration string and return its length in seconds.

    Three notations are tried in order:
      * clock style, e.g. "1:02:03", "01:02:03:04" or "12:34.5"
      * unit style, e.g. "3h11m53s", "PT1H0.040S" or "1 hour 3 minutes"
        (years, months and weeks are accepted but not counted)
      * a single, possibly fractional amount of hours or minutes, e.g. "2.5 hours"

    Returns None if `s` is not a string, is blank or matches no notation.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    # Ordered from most to least specific; the first match wins
    patterns = (
        r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''',
        r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$',
    )
    for pattern in patterns:
        m = re.match(pattern, s)
        if m:
            break
    else:
        return None

    # Not every pattern defines every field; absent ones count as unset
    fields = m.groupdict()

    duration = 0
    # The factors are applied one by one, smallest unit first
    units = (
        ('secs', ()),
        ('mins', (60,)),
        ('hours', (60, 60)),
        ('days', (24, 60, 60)),
    )
    for name, factors in units:
        text = fields.get(name)
        if not text:
            continue
        amount = float(text)
        for factor in factors:
            amount *= factor
        duration += amount

    fraction = fields.get('ms')
    if fraction:
        # The clock notation also allows ":" as the fraction separator
        duration += float(fraction.replace(':', '.'))
    return duration
2616
2617
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the file's real extension.

    If `expected_real_ext` is given and differs from the real extension,
    `ext` is appended to the unchanged filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
2624
2625
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's real extension with `ext`.

    If `expected_real_ext` is given and differs from the real extension,
    `ext` is appended to the unchanged filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
2631
2632
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if starting the process raises OSError. """
    try:
        proc = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate_or_kill()
    except OSError:
        return False
    return exe
2641
2642
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr output.

    Returns False if starting the process raises OSError. Output received as
    bytes is decoded as ASCII, dropping undecodable bytes.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        proc = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate_or_kill()
    except OSError:
        return False
    if not isinstance(out, bytes):
        return out
    return out.decode('ascii', 'ignore')
2656
2657
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's version output.

    @param output        Text printed by the program (must be a str)
    @param version_re    Regex whose first group captures the version;
                         defaults to matching "version <something>"
    @param unrecognized  Value returned when the regex does not match
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
2667
2668
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    False is also returned when the executable produced no output. """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2675
2676
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable

    Items are pulled from the iterable only as far as an access requires and
    are kept in a cache, so earlier items can be read again.
    Note that slices of a LazyList are lists and not LazyList
    """

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self.__iterable = iter(iterable)
        if _cache is None:
            _cache = []
        # May be shared with other instances created by reversed()/copy
        self.__cache = _cache
        self.__reversed = reverse

    def __exhaust(self):
        """Pull all remaining items into the cache and return the cache itself"""
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        items = self.__exhaust()
        # Both branches return a new list, never the cache itself
        return items[::-1] if self.__reversed else items[::1]

    def __iter__(self):
        if not self.__reversed:
            yield from self.__cache
            for item in self.__iterable:
                self.__cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    @staticmethod
    def __reverse_index(x):
        """Translate an index of the reversed view into an index of the cache"""
        if x is None:
            return None
        return -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
        else:
            # Fetch only as many items as are needed to reach the highest index
            missing = max(start or 0, stop or 0) - len(self.__cache) + 1
            if missing > 0:
                self.__cache.extend(itertools.islice(self.__iterable, missing))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing a single item avoids evaluating the whole iterable
        probe = -1 if self.__reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self.__exhaust())

    def __reversed__(self):
        flipped = not self.__reversed
        return type(self)(self.__iterable, reverse=flipped, _cache=self.__cache)

    def __copy__(self):
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __str__(self):
        return repr(self.exhaust())

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())
2765
2766
class PagedList:
    """Base class for lists whose entries are fetched page by page.

    `pagefunc` is called with a page number and must return an iterable of
    that page's entries. Subclasses implement _getslice().
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page `pagenum` as a list, using the cache if possible"""
        results = self._cache.get(pagenum)
        if results is None:
            results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def getslice(self, start=0, end=None):
        """Return the entries produced by _getslice(start, end) as a list"""
        return list(self._getslice(start, end))

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2804
2805
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages one after another until a page is not full."""

    def _getslice(self, start, end):
        """Yield the entries with indices from `start` up to (excluding) `end`.

        `end` may be None to continue until the last page. Pages are fetched
        only as they are needed.
        """
        # An empty or inverted range selects nothing. Without this guard a
        # range like (0, 0) would yield the whole first page, and an inverted
        # range spanning several pages would yield everything after `start`
        if end is not None and end <= start:
            return

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            # Offset of `start` within this page; 0 on all but the first page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted entry if `end` falls on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            page_results = self.getpage(pagenum)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2839
2840
class InAdvancePagedList(PagedList):
    """PagedList for which the number of pages is known beforehand."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        # Caching is always enabled for this list type
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        """Yield the entries with indices from `start` up to (excluding) `end`.

        `end` may be None to continue until the last page.
        """
        # An empty or inverted range selects nothing. Without this guard an
        # inverted range would yield entries through a negative slice bound
        # and an empty one would still fetch a page
        if end is not None and end <= start:
            return

        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        # Entries still to be yielded; None means no limit
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2863
2864
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence (8 hex digits) in `s` with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        character, _consumed = decode(mobj.group(0))
        return character

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
2871
2872
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence (4 hex digits) in `s` with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        character, _consumed = decode(mobj.group(0))
        return character

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
2879
2880
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    Characters that have a meaning in URLs, as well as "%", are left as they
    are, so already escaped sequences are not escaped a second time.
    """
    # This module is Python 3 only (it uses f-strings and `yield from`), so
    # the former Python 2 pre-encoding of the string could never run
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2886
2887
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host part is IDNA-encoded; path, params, query and fragment are
    escaped with escape_rfc3986.
    """
    parts = compat_urllib_parse_urlparse(url)
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2898
2899
def parse_qs(url):
    """Return the query string of `url` parsed into a dict mapping each key to a list of values."""
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2902
2903
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object, one per line, and close it.

    Byte lines are decoded as UTF-8. A leading BOM and surrounding whitespace
    are removed. Blank lines and lines starting with "#", ";" or "]" are
    skipped, as is a trailing comment introduced by whitespace and "#".
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(line):
        if not isinstance(line, compat_str):
            line = line.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if line.startswith(bom):
                line = line[len(bom):]
        line = line.lstrip()
        if not line or line.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        without_comment = re.split(r'\s#', line, maxsplit=1)[0]
        return without_comment.rstrip()

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for line in fd:
            url = fixup(line)
            if url:
                urls.append(url)
    return urls
2921
2922
def urlencode_postdata(*args, **kargs):
    """URL-encode the given form data (same arguments as urlencode) and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2925
2926
def update_url_query(url, query):
    """Return `url` with the parameters in `query` added to its query string.

    Parameters already present in the URL are replaced by those in `query`.
    With an empty `query` the URL is returned unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2935
2936
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a new request based on `req` with the given parts updated.

    @param url      Replacement URL; defaults to the URL of `req`
    @param data     Replacement body; defaults to the body of `req`
    @param headers  Headers added to (or overriding) those of `req`
    @param query    Query parameters added to the URL

    HEAD and PUT requests keep their method. A `timeout` attribute on `req`
    is copied over if present.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)

    request_classes = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }
    req_type = request_classes.get(req.get_method(), compat_urllib_request.Request)

    new_req = req_type(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2955
2956
2957 def _multipart_encode_impl(data, boundary):
2958     content_type = 'multipart/form-data; boundary=%s' % boundary
2959
2960     out = b''
2961     for k, v in data.items():
2962         out += b'--' + boundary.encode('ascii') + b'\r\n'
2963         if isinstance(k, compat_str):
2964             k = k.encode('utf-8')
2965         if isinstance(v, compat_str):
2966             v = v.encode('utf-8')
2967         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2968         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2969         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2970         if boundary.encode('ascii') in content:
2971             raise ValueError('Boundary overlaps with data')
2972         out += content
2973
2974     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2975
2976     return out, content_type
2977
2978
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of (encoded body, Content-Type header value). A specified
    boundary that occurs in the data raises ValueError; a generated one is
    replaced by a new random boundary in that case.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # The caller asked for this exact boundary, so a clash is an error
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashes with the data; try another one
            continue
3007
3008
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of several, in `d`.

    For a list/tuple of keys, missing keys and None values are skipped, as
    are other falsy values unless `skip_false_values` is False. A single key
    is looked up with a plain d.get() without any skipping.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
3017
3018
def try_get(src, getter, expected_type=None):
    """Return the result of the first getter that succeeds on `src`.

    `getter` is one callable or several. A getter is skipped if it raises
    AttributeError, KeyError, TypeError or IndexError, or if `expected_type`
    is given and its result is not an instance of it. Returns None if no
    getter qualifies.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
3028
3029
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to the values of earlier dicts.

    None values are ignored. An existing value is only replaced if it is an
    empty string and the new value is a non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key in merged:
                current = merged[key]
                fills_blank = (
                    isinstance(value, compat_str) and value
                    and isinstance(current, compat_str) and not current)
                if not fills_blank:
                    continue
            merged[key] = value
    return merged
3042
3043
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a str, decoding it with `encoding` if it is not one already.

    Note that the default encoding is determined once, when this function
    is defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
3046
3047
# Age limit assigned to each US rating label. parse_age_limit() looks its
# upper-cased input up in this table
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3055
3056
# Age limit assigned to each TV Parental Guidelines label. parse_age_limit()
# matches the part after "TV-" and also accepts "TV_" or no separator at all
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3065
3066
def parse_age_limit(s):
    """Convert an age rating into a numeric age limit, or None if not understood.

    Accepted are plain ints from 0 to 21 (bools and other int subclasses are
    rejected), strings of one or two digits with an optional trailing "+",
    US rating labels and TV Parental Guidelines labels (case-insensitive).
    """
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    # Only the part after the "TV-" prefix of each known label is matched
    tv_labels = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    m = re.match(r'^TV[_-]?(%s)$' % tv_labels, s)
    if not m:
        return None
    return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3082
3083
def strip_jsonp(code):
    """Return the argument of a JSONP callback invocation.

    Handles an optional "window." prefix, the "name && name(...)" form, an
    optional trailing ";" and trailing "//" comments. Input that does not
    look like a JSONP call is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
3092
3093
def js_to_json(code, vars={}):
    """Rewrite a JavaScript object/array literal so that it can be parsed as JSON.

    Strings become double-quoted, bare identifiers and numeric keys are
    quoted, hexadecimal and octal integers are converted to decimal,
    "undefined" and "void 0" become null, and comments, trailing commas and
    "!" sequences are dropped.
    """
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Sequences inside string literals that need a different spelling in JSON
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_string_part(part):
        text = part.group(0)
        return STRING_FIXES.get(text, text)

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token in ('undefined', 'void 0'):
            return 'null'
        if token == ',' or token.startswith(('/*', '//', '!')):
            return ""

        if token[0] in ("'", '"'):
            # String literal: normalize its content and re-quote it
            return '"%s"' % re.sub(r'(?s)\\.|"', fix_string_part, token[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, token)
            if im:
                number = int(im.group(1), base)
                # A trailing ":" means the integer is used as an object key
                return '"%d":' % number if token.endswith(':') else '%d' % number

        if token in vars:
            return vars[token]

        return '"%s"' % token

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3140
3141
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in `quality_ids`,
    or to -1 if it is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position

    return q
3150
3151
# NOTE(review): presumably the stage names at which a postprocessor can be
# scheduled to run - confirm against the users of this set
POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3153
3154
# Output filename templates in %-format syntax with named fields, keyed by
# output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# NOTE(review): the values look like the fixed filename suffix used for each
# output type, with None where there is none - confirm against the users
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3172
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template that has to be completed with str.format() before use:
#  {0} is the regex for the mapping key, {1} the regex for the conversion type.
# A "%" that is escaped by a preceding "%" is not matched as a format spec;
# any such escaped pairs directly in front of the spec end up in "prefix"
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3191
3192
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than `length` are cut so that the result, including the
    trailing "...", is `length` characters long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
3201
3202
def version_tuple(v):
    """Split a version string at "." and "-" and return its parts as a tuple of ints.

    Raises ValueError if any part is not an integer.
    """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
3205
3206
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    If `version` is empty, or either version cannot be parsed, the answer is
    "not outdated" when `assume_new` is true and "outdated" otherwise.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
3214
3215
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3222
3223
def args_to_str(args):
    """Return the arguments joined by spaces, each quoted for the shell where necessary."""
    # Get a short string representation for a subprocess command
    quoted = map(compat_shlex_quote, args)
    return ' '.join(quoted)
3227
3228
def error_to_compat_str(err):
    """Return the message of the exception `err` as a str."""
    # This module is Python 3 only (it uses f-strings and `yield from`), where
    # str() already gives text; the former Python 2 decoding step could never run
    return str(err)
3236
3237
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters following a ";" are ignored and matching is case-insensitive.
    The lookup order is: the full type, the subtype, the structured syntax
    suffix (the part after "+"); if none is known, the subtype itself is
    returned with "+" replaced by ".". Returns None if `mt` is None.
    """
    if mt is None:
        return None

    # Type and subtype names are case-insensitive, so normalize once up front
    # instead of treating the case differently in each of the lookups below
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3300
3301
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension, filename or URL.

    A value without any "." is taken to be a bare extension. Returns None
    for empty input or if the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(filename)
    return mime_type
3308
3309
def parse_codecs(codecs_str):
    """Split a codecs string (as in RFC 6381) into video, audio and text codec.

    Returns a dict with 'vcodec', 'acodec' ('none' if absent) and
    'dynamic_range', plus 'tcodec' if a text codec is present. Only the
    first codec of each kind is used. If no codec is recognized but exactly
    two are given, they are taken as video and audio codec, in that order.
    Returns {} if nothing can be determined.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = (
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = (
        'flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3',
        'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    TEXT_CODECS = ('stpp', 'wvtt',)

    stripped = (part.strip() for part in codecs_str.strip().strip(',').split(','))
    split_codecs = [part for part in stripped if part]

    vcodec = acodec = tcodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. "av01" and "vp09" are recognized
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                continue
            if codec in ('vp9', 'av1', 'hvc1'):
                vcodec = '.'.join(parts[:4])
            else:
                vcodec = full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif codec in TEXT_CODECS:
            tcodec = tcodec or full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec or tcodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if tcodec is not None:
            result['tcodec'] = tcodec
        return result
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3351
3352
def urlhandle_detect_ext(url_handle):
    """Determine the file extension for a response from its headers.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(m.group('filename'), default_ext=None) if m else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3365
3366
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI containing the bytes `data` with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3369
3370
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked if no age limit is set or if the content has no
    limit (i.e. is available for everyone). """

    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3379
3380
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their BOM (UTF-8 without one) and
    checked for "<" as first non-whitespace character. The result is the
    match object (truthy) or None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, payload = 'utf-8', first_bytes
    # The UTF-32 marks are listed before the UTF-16 ones they start with
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = bom_encoding, first_bytes[len(bom):]
            break

    return re.match(r'^\s*<', payload.decode(encoding, 'replace'))
3399
3400
def determine_protocol(info_dict):
    """Return the download protocol for a format/info dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the sanitized 'url': from its beginning for rtmp, mms and rtsp,
    from its extension for m3u8 and f4m, and from its scheme in all
    other cases.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
3421
3422
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    r""" Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row  List of header cells
    @param data        List of rows; every cell is converted with str()
    @param delim       If truthy, a string that is repeated to draw a
                       separator line between the header and the data
    @param extra_gap   Number of additional spaces between columns
    @param hide_empty  Drop every column whose data cells all have zero width
    @returns           The rendered table as a single string

    The rows passed in are never modified.
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filter_array):
        return [col for (take, col) in zip(filter_array, row) if take]

    if hide_empty:
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]
    else:
        # Work on copies: the padding loop below writes back into the rows
        # and must not alter the lists owned by the caller
        header_row = list(header_row)
        data = [list(row) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim and max_lens:  # without any column there is nothing to underline
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        # Remove extra_gap from end of delimiter (delim may be longer than one character)
        delim_row[-1] = delim_row[-1][:-extra_gap * len(delim)]
        table = [header_row, delim_row] + data
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # The tab expands so that the text following it ends at the column edge
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    return '\n'.join(''.join(row).rstrip() for row in table)
3454
3455
3456 def _match_one(filter_part, dct, incomplete):
3457     # TODO: Generalize code with YoutubeDL._build_format_filter
3458     STRING_OPERATORS = {
3459         '*=': operator.contains,
3460         '^=': lambda attr, value: attr.startswith(value),
3461         '$=': lambda attr, value: attr.endswith(value),
3462         '~=': lambda attr, value: re.search(value, attr),
3463     }
3464     COMPARISON_OPERATORS = {
3465         **STRING_OPERATORS,
3466         '<=': operator.le,  # "<=" must be defined above "<"
3467         '<': operator.lt,
3468         '>=': operator.ge,
3469         '>': operator.gt,
3470         '=': operator.eq,
3471     }
3472
3473     operator_rex = re.compile(r'''(?x)\s*
3474         (?P<key>[a-z_]+)
3475         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3476         (?:
3477             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3478             (?P<strval>.+?)
3479         )
3480         \s*$
3481         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3482     m = operator_rex.search(filter_part)
3483     if m:
3484         m = m.groupdict()
3485         unnegated_op = COMPARISON_OPERATORS[m['op']]
3486         if m['negation']:
3487             op = lambda attr, value: not unnegated_op(attr, value)
3488         else:
3489             op = unnegated_op
3490         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3491         if m['quote']:
3492             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3493         actual_value = dct.get(m['key'])
3494         numeric_comparison = None
3495         if isinstance(actual_value, compat_numeric_types):
3496             # If the original field is a string and matching comparisonvalue is
3497             # a number we should respect the origin of the original field
3498             # and process comparison value as a string (see
3499             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3500             try:
3501                 numeric_comparison = int(comparison_value)
3502             except ValueError:
3503                 numeric_comparison = parse_filesize(comparison_value)
3504                 if numeric_comparison is None:
3505                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3506                 if numeric_comparison is None:
3507                     numeric_comparison = parse_duration(comparison_value)
3508         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3509             raise ValueError('Operator %s only supports string values!' % m['op'])
3510         if actual_value is None:
3511             return incomplete or m['none_inclusive']
3512         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3513
3514     UNARY_OPERATORS = {
3515         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3516         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3517     }
3518     operator_rex = re.compile(r'''(?x)\s*
3519         (?P<op>%s)\s*(?P<key>[a-z_]+)
3520         \s*$
3521         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3522     m = operator_rex.search(filter_part)
3523     if m:
3524         op = UNARY_OPERATORS[m.group('op')]
3525         actual_value = dct.get(m.group('key'))
3526         if incomplete and actual_value is None:
3527             return True
3528         return op(actual_value)
3529
3530     raise ValueError('Invalid filter part %r' % filter_part)
3531
3532
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
        The conditions are separated by "&" (a literal "&" is written as "\\&")
        When incomplete, all conditions passes on missing fields
    """
    for raw_part in re.split(r'(?<!\\)&', filter_str):
        condition = raw_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3540
3541
def match_filter_func(filter_str):
    """ Create a match filter callable for filter_str.

    The returned function gives None if the info dict passes the filter,
    otherwise a message saying that the video is skipped.
    """
    def _match_func(info_dict, *args, **kwargs):
        if not match_str(filter_str, info_dict, *args, **kwargs):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
3550
3551
def parse_dfxp_time_expr(time_expr):
    """ Parse a DFXP/TTML time expression and return it in seconds as a float.

    Accepted are an offset such as "12.5" or "12.5s" and a clock time
    "H:MM:SS", optionally followed by "." or ":" and more digits.
    Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ":" in front of the trailing digits is read like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
3563
3564
def srt_subtitles_timecode(seconds):
    """ Format a time given in seconds as an SRT timecode """
    msec = seconds * 1000
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(msec)
3567
3568
def ass_subtitles_timecode(seconds):
    """ Format a time given in seconds as an ASS timecode """
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field of the timecode is derived from the milliseconds (divided by 10)
    hundredths = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], hundredths)
3572
3573
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # Legacy namespace URIs and the current URI they are rewritten to before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # tts:* attributes that are carried over; all others are ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, including inherited properties
    default_style = {}  # properties of the style referenced by <body>/<div>

    class TTMLPElementParser(object):
        """ XMLParser target that turns the content of one <p> into SRT markup """
        _out = ''
        # NOTE(review): the two lists below are class attributes, so they are
        # shared by all parser instances created during one dfxp2srt() call.
        # Left untouched on purpose, since the generated output depends on it
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: defaults, then the referenced style, then inline attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that are already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened for this element, innermost first
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                # The applied style is only dropped if the element emitted tags
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the node and feed it through the parser target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Collect the <style> definitions. A style may be declared before the
    # parent it inherits from, so the pass is repeated while styles are still
    # waiting for their parent. The loop stops as soon as a pass does not
    # reduce the number of waiting styles (missing or cyclic parents),
    # which would otherwise repeat forever
    previously_deferred = None
    while True:
        deferred = 0
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    deferred += 1
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            else:
                # Register the style even without any supported property,
                # so that styles inheriting from it can be resolved
                styles.setdefault(style_id, {})
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not deferred or deferred == previously_deferred:
            break
        previously_deferred = deferred

    # The style referenced by <body> and <div> becomes the default of every element
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + duration; skip the cue if neither is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3736
3737
def cli_option(params, command_option, param):
    """ Build the command line arguments for an option that takes a value.

    @param params          Dictionary of parameters
    @param command_option  The option to emit, e.g. "--proxy"
    @param param           Key of the value in params
    @returns               [command_option, value as string], or [] if the
                           value is missing or None
    """
    value = params.get(param)
    if value is None:
        return []
    # Always convert to a string; falsy values such as 0 are valid too and
    # must not end up as non-string items in the argument list
    return [command_option, str(value)]
3743
3744
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for a boolean option.

    Returns [] if the parameter is missing or None. Otherwise the option is
    emitted with true_value or false_value, either as a separate argument
    or joined to the option using separator.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
3753
3754
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if params[param] equals expected_value, otherwise [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3758
3759
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Look up the extra command line arguments configured for the given keys.

    @param argdict     Dictionary mapping lowercase keys to lists of arguments.
                       A list/tuple is accepted for backward compatibility
                       and returned as it is when use_compat is set
    @param keys        List/tuple of candidates in order of priority; each
                       candidate is a key or a group of keys whose
                       arguments are concatenated
    @param default     Value returned when nothing is configured
    @param use_compat  Whether a list/tuple argdict may be used directly
    @returns           The arguments of the first candidate that has any
                       configured, else default (a copy, if it is a list)
    """
    def fallback():
        # default may be the list shared through the function signature:
        # hand out a copy so that callers modifying the result cannot
        # affect later calls
        return list(default) if isinstance(default, list) else default

    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return fallback()
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        configured = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_list = [args for args in configured if args is not None]
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return fallback()
3778
3779
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Build the list of lookup keys for main_key/exe and query cli_configuration_args.

    The root key is the lowercase exe if it equals the lowercase main_key,
    else "main_key+exe"; every entry of keys is appended to it as a suffix.
    Only if the bare root key is among the candidates, the fallbacks
    ((main_key, exe) and "default") are added and use_compat is honoured.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    suffixes = keys or ['']
    lookup_keys = [f'{root_key}{suffix}' for suffix in suffixes]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3791
3792
class ISO639Utils(object):
    """ Conversion between ISO 639-1 and ISO 639-2/T language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up; unknown codes give None
        two_letter_code = code[:2]
        return cls._lang_map.get(two_letter_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # The first matching entry of the map wins; unknown codes give None
        return next(
            (short_name for short_name, long_name in cls._lang_map.items() if long_name == code),
            None)
3996
3997
class ISO3166Utils(object):
    """ Lookup of country names by their two-letter country code """

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (in any case) to the corresponding full name"""
        # The keys of the map are uppercase; unknown codes give None
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
4256
4257
class GeoUtils(object):
    """Helpers for picking IPv4 addresses that belong to a given country."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (as a string) from an address block.

        A two-character argument is treated as a country code and looked up
        (case-insensitively) in `_country_ip_map`; None is returned when the
        code has no entry. Any other argument is used directly as a block in
        'address/prefix-length' notation.
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_length = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every bit to the right of the prefix gives the last address of the block
        host_mask = 0xffffffff >> int(prefix_length)
        highest = lowest | host_mask
        packed = compat_struct_pack('!L', random.randint(lowest, highest))
        return compat_str(socket.inet_ntoa(packed))
4516
4517
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; its value replaces the
    proxy configured for the handler, and the header is removed before the
    request goes on.
    """

    def __init__(self, proxies=None):
        # Install fallback `http_open`/`https_open` attributes before the base
        # initializer runs; they call proxy_open() with the '__noproxy__' marker
        for scheme in ('http', 'https'):
            def _fallback_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)

            setattr(self, '%s_open' % scheme, _fallback_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Route `req` through `proxy`, honouring a per-request override.

        Returns None when no proxy applies or when a SOCKS proxy was
        requested; otherwise defers to the base class implementation.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4541
4542
4543 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4544 # released into Public Domain
4545 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4546
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    The result is big-endian without leading zero bytes. Zero (and any
    negative value) is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # The minimal number of bytes is derived from the bit length, so there
        # are no leading zeros to strip afterwards
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # pad the front up to the next multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4575
4576
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    byte string yields 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    return int.from_bytes(s, 'big')
4592
4593
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt one block with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the data is read as one big number
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    return '%x' % pow(plain_number, exponent, modulus)
4609
4610
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where every
    padding value is non-zero: the first zero after the [0, 2] header marks
    the end of the padding, so a zero inside it would cut the padding short.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves less than 11 values for the
                               header, padding and separator
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # NOTE: `random` is not a cryptographically secure source of randomness
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4624
4625
def encode_base_n(num, n, table=None):
    """Convert the integer `num` to its representation in base `n`.

    @param num      integer to convert; a negative number is rendered as
                    '-' followed by the digits of its absolute value
    @param n        the base, at least 2
    @param table    digit characters, indexed by digit value. Defaults to the
                    first n characters of 0-9, a-z, A-Z
    @raises ValueError  if n is smaller than 2 or the table has fewer than
                        n digits
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    # A base below 2 can never reduce the number, so the loop would not end
    if n < 2:
        raise ValueError('base %d is not supported; it must be at least 2' % n)

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number to zero, so the sign is
    # handled separately
    if num < 0:
        return '-' + encode_base_n(-num, n, table)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4642
4643
def decode_packed_codes(code):
    """Expand packed code by substituting its symbols back into it.

    PACKED_CODES_RE has to yield four groups: the obfuscated code, the base,
    the number of symbols and the '|'-separated symbol list. Every word of the
    obfuscated code is the base-n encoding of an index into that list; a word
    whose symbol is empty stands for itself.
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    radix = int(base)
    words = symbols.split('|')

    symbol_table = {}
    remaining = int(count)
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        symbol_table[token] = words[remaining] or token

    def _substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
4660
4661
def caesar(s, alphabet, shift):
    """Shift every character of `s` by `shift` positions within `alphabet`.

    The shift wraps around at the ends of the alphabet. Characters that are
    not part of the alphabet are kept unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4669
4670
def rot47(s):
    """Apply caesar() with a shift of 47 over the characters from '!' to '~'."""
    rot47_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rot47_alphabet, 47)
4673
4674
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated list of KEY=value attributes into a dict.

    Values may be wrapped in double quotes, in which case they can contain
    commas; the quotes are not part of the returned value. When a key occurs
    more than once, its last value is kept.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: (raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs
    }
4682
4683
def urshift(val, n):
    """Shift `val` right by `n` bits, treating it as an unsigned 32-bit value.

    A negative `val` is first mapped to its unsigned equivalent by adding 2**32.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
4686
4687
4688 # Based on png2str() written by @gdkchan and improved by @yokrysty
4689 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG data into its dimensions and unfiltered sample values.

    Returns a tuple (width, height, pixels). `pixels` holds one list per image
    row, each with width * 3 integer samples in the range 0-255.

    The decoder works with three samples per pixel and does not inspect the
    bit depth, colour type or interlace method from the header - presumably
    only 8-bit RGB images without alpha or interlacing are expected.

    Raises IOError if the data lacks the PNG signature or a leading IHDR
    chunk, or contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, remaining = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    struct_formats = {1: '>B', 2: '>H', 4: '>I'}

    def _read_uint(raw):
        return compat_struct_unpack(struct_formats[len(raw)], raw)[0]

    # Every chunk is laid out as: length (4 bytes), type (4 bytes), data, CRC (4 bytes)
    chunks = []
    while remaining:
        size = _read_uint(remaining[:4])
        chunks.append((remaining[4:8], remaining[8:8 + size]))
        remaining = remaining[12 + size:]  # the CRC is skipped, not verified

    ihdr = chunks[0][1]
    width = _read_uint(ihdr[:4])
    height = _read_uint(ihdr[4:8])

    idat = b''.join(data for chunk_type, data in chunks if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    raw = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline starts with one byte naming the filter of that line
        row_start = y * (1 + stride)
        filter_type = raw[row_start]
        previous_row = pixels[y - 1] if y > 0 else None

        row = []
        pixels.append(row)

        for x in range(stride):
            color = raw[1 + row_start + x]
            # Neighbouring samples lie one pixel (3 samples) to the left
            # and one row up; missing neighbours count as 0
            left = row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                # The neighbour closest to the estimate serves as predictor
                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    color = (color + left) & 0xff
                elif dist_up <= dist_upper_left:
                    color = (color + up) & 0xff
                else:
                    color = (color + upper_left) & 0xff

            row.append(color)

    return width, height, pixels
4793
4794
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`.

    `value` is a byte string: it is written as-is to the binary stream on
    Windows and decoded as UTF-8 before it is handed to a command line tool.

    The first available backend is used:
      1. the `xattr` Python module (either pyxattr or xattr),
      2. NTFS Alternate Data Streams if the module is missing on Windows,
      3. the `setfattr` or `xattr` executables otherwise.

    Raises XAttrMetadataError if the chosen backend fails to write the
    attribute, and XAttrUnavailableError if no usable backend is found or
    the installed pyxattr is too old.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # pyxattr exposes `set`, while the xattr module exposes `setxattr`
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE(review): despite the message, this error presumably
                # propagates to the caller - the handler below only catches
                # ImportError
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No xattr module is installed: use a platform specific replacement
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # A stream is addressed as "<file name>:<stream name>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed as a command line argument, so it has to be text
                value = value.decode('utf-8')
                # setfattr takes precedence when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with the tool's stderr output
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
4877
4878
def random_birthday(year_field, month_field, day_field):
    """Return a random date of birth as a dict of strings.

    The date is picked uniformly between 1950-01-01 and 1995-12-31 (both
    included). The three arguments are the keys under which the year, the
    month and the day are stored.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_in_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(days=random.randint(0, span_in_days))

    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return {field: str(part) for field, part in zip(field_names, date_parts)}
4889
4890
# Templates for internet shortcut files, which are plain text files.
# The templates contain named %-style placeholders: `url` in all of them and
# additionally `filename` in the .desktop one. Each `.lstrip()` removes the
# line break that follows the opening quotes, so that the text starts directly
# with its first content line.

# Shortcut with an [InternetShortcut] section (key 'url' in LINK_TEMPLATES)
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# Shortcut in the form of an XML property list (key 'webloc' in LINK_TEMPLATES)
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# Shortcut with a [Desktop Entry] section of type "Link" (key 'desktop' in LINK_TEMPLATES)
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Maps the name of a shortcut format to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4922
4923
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicitly given port is kept, except for port 80 of `http` IRIs, where it is redundant. IRIs without a host (e.g. relative references) are converted as well.

    Raises ValueError if the IRI contains an IPv6 host or an invalid port.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no host part
    hostname = iri_parts.hostname or ''
    net_location += hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    # Port 80 may only be dropped where it is the default port. For any other
    # scheme, omitting it would make the URI point to a different port.
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4966
4967
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to MAX_PATH on Windows.

    On 'win32' and 'cygwin' the path is made absolute and prefixed with
    `\\\\?\\`; on every other platform it is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return r'\\?\ '.rstrip() + os.path.abspath(path)
4974
4975
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value looked up in `obj` with a %-style template.

    The value is fetched with traverse_obj(obj, *variadic(field)). If it is
    one of the `ignore` values, `default` is returned instead. Otherwise the
    value - passed through `func` first, if one is given - is inserted into
    `template`.
    """
    value = traverse_obj(obj, *variadic(field))
    if value in ignore:
        return default
    if func:
        value = func(value)
    return template % value
4981
4982
def clean_podcast_url(url):
    """Remove the known tracking/analytics redirect prefixes from `url`.

    Every occurrence of a matching prefix (including its trailing slash) is
    deleted, so chained prefixes are removed as well.
    """
    tracking_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix, '', url)
4998
4999
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lower-case string.

    In the template, '4' is the version digit, every 'x' becomes a random
    hexadecimal digit and 'y' becomes one of 8, 9, a or b. The latter sets the
    two highest bits of that digit to "10", which is the variant that RFC 4122
    prescribes for such UUIDs.
    """
    def _random_digit(placeholder):
        choices = _HEX_TABLE if placeholder.group(0) == 'x' else '89ab'
        return random.choice(choices)

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5005
5006
def make_dir(path, to_screen=None):
    """Create the directory that is to contain the file `path`, if it is missing.

    @param path         path of a file; only its directory part is created
    @param to_screen    optional callable that receives a message when the
                        directory cannot be created
    @returns            True if the directory exists afterwards (or `path`
                        has no directory part), False if creating it failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Reporting is optional: without a callback the failure is only
        # signalled through the return value
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5017
5018
def get_executable_path():
    """Return the absolute path of the directory the program is run from.

    This is the directory of the executable for a frozen build, and
    otherwise a directory above the one containing this file: two levels up
    when the module was loaded from a ZIP archive, one level up in any
    other case.
    """
    from zipimport import zipimporter

    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        return os.path.abspath(os.path.dirname(sys.executable))

    module_dir = os.path.dirname(__file__)
    if isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        levels_up = '../..'
    else:
        levels_up = '..'
    return os.path.abspath(os.path.join(module_dir, levels_up))
5028
5029
def load_plugins(name, suffix, namespace):
    """Load the plugin package `name` and collect the plugin classes in it.

    The package is read from `ytdlp_plugins/<name>/__init__.py` below the
    directory returned by get_executable_path(). Every attribute of it whose
    name ends with `suffix` and is not yet a key of `namespace` is added to
    `namespace` and to the returned dict.

    @param name         name of the plugin package
    @param suffix       suffix that the names of plugin classes end with
    @param namespace    dict that receives the plugins (modified in place)
    @returns            dict mapping the names of the newly added plugins
                        to the plugins; empty if there is no such package
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        try:
            plugins_spec.loader.exec_module(plugins)
        except Exception:
            # Don't leave a module that failed to load (e.g. because the
            # package does not exist) importable under this name
            sys.modules.pop(plugins_spec.name, None)
            raise
        for attr_name in dir(plugins):
            if attr_name in namespace:
                continue
            if not attr_name.endswith(suffix):
                continue
            klass = getattr(plugins, attr_name)
            classes[attr_name] = namespace[attr_name] = klass
    except FileNotFoundError:
        # A missing plugin package simply means that there are no plugins
        pass
    return classes
5048
5049
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The value found by the first path that yields a usable
                            result, or the default if no path does
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records how many levels of lists the branching keys of a
        # path have produced, so that the result can be flattened afterwards
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Follow every alternative, then branch over the results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # The loop below needs (key, value) pairs, just like for
                    # lists: the index and the character at that index
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    # A bare ":" selects everything, which is what "..." does
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching keys produce nested lists; flatten all but the
                # last level, dropping the branches that led nowhere
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5147
5148
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj.

    Writes a deprecation notice through write_string on every call, then
    forwards @dictn and @keys to traverse_obj with is_user_input and
    traverse_string enabled; @casesense is passed through unchanged.
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5153
5154
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return @x itself if it is an iterable that is not one of @allowed_types, else the 1-tuple (x,)"""
    is_iterable = isinstance(x, collections.abc.Iterable)
    if is_iterable and not isinstance(x, allowed_types):
        return x
    return (x,)
5157
5158
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Build an HMAC-SHA256 signed token of the form header.payload.signature

    @param payload_data  JSON-serializable object used as the token payload
    @param key           signing secret; a str is encoded as UTF-8, bytes are used as-is
    @param headers       optional dict merged over the default header
                         {'alg': 'HS256', 'typ': 'JWT'}; it is not modified
    @returns             the token as bytes
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    if isinstance(key, str):
        key = key.encode('utf-8')
    # NOTE: the segments are standard Base64 *with* padding, whereas RFC 7515
    # specifies the unpadded base64url alphabet. This is kept as-is on purpose:
    # switching would change the tokens produced for existing callers
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    signing_input = header_b64 + b'.' + payload_b64
    signature_b64 = base64.b64encode(hmac.new(key, signing_input, hashlib.sha256).digest())
    return signing_input + b'.' + signature_b64
5176
5177
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the JSON-decoded payload of a header.payload.signature token

    The signature is NOT verified and the header is ignored.

    @param jwt  the token as str, or as ASCII bytes
    @returns    the decoded payload object
    @raises     ValueError if the token does not consist of exactly three
                '.'-separated segments, or the payload is not valid Base64/JSON
    """
    if isinstance(jwt, bytes):
        jwt = jwt.decode('ascii')
    # Unpacking into three names enforces the three-segment layout
    _, payload_b64, _ = jwt.split('.')
    # RFC 7515 strips the trailing '=' padding from each segment, but the
    # decoder requires it; restore it (no-op when the segment is already padded)
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5183
5184
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences can be written to @stream

    On 'nt', WINDOWS_VT_MODE must be truthy and get_windows_version() must be
    at least (10, 0, 10586); elsewhere the TERM environment variable must be
    set and non-empty. If that holds, the answer is stream.isatty(), or False
    when calling it raises anything.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE:
            return False
        if get_windows_version() < (10, 0, 10586):
            return False
    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
5196
5197
# SGR (colour/style) sequence: ESC [ <parameter bytes 0x30-0x3F> <intermediate bytes 0x20-0x2F> m
# The middle part is restricted to those bytes so that other CSI sequences
# (e.g. "ESC [ 2 K") cannot swallow the text up to the next "m",
# and so that the parameter-less reset "ESC [ m" is matched as well
_terminal_sequences_re = re.compile('\033\\[[0-?]*[ -/]*m')


def remove_terminal_sequences(string):
    """Return @string with every SGR escape sequence (ESC [ ... m) removed"""
    return _terminal_sequences_re.sub('', string)
5203
5204
def number_of_digits(number):
    """Return the length of @number rendered with '%d' (a leading minus sign is counted too)"""
    as_text = '%d' % number
    return len(as_text)
5207
5208
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the str() of every truthy value with @delim

    If @from_dict is given, @values are treated as keys and are first
    replaced by from_dict.get(key). Falsy values (None, '', 0, ...) are dropped.
    """
    if from_dict is not None:
        values = (from_dict.get(name) for name in values)
    return delim.join(str(item) for item in values if item)
5213
5214
class Config:
    """A list of command-line arguments together with the configs it pulls in

    Each instance holds the arguments it was initialized with (own_args) and
    a list of child Config objects (configs), one per location named by the
    parsed "config_locations" option. Children share the parent's set of
    already-loaded file paths, so the same file is never loaded twice.
    """
    own_args = None  # the args passed to init(); None until then
    filename = None  # the file own_args were read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Store @args and load every config location they refer to

        May only be called once per instance.
        @returns  False (leaving the instance uninitialized) if @filename
                  has already been loaded, True otherwise
        """
        assert not self.__initialized
        if filename:
            location = os.path.realpath(filename)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            location = compat_expanduser(location)
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                # NOTE(review): presumably parser.error() aborts, as optparse's does - confirm
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # Own (scrubbed) args first, then each child config indented with "| "
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read @filename and split its contents into a list of arguments

        @returns  the arguments, or @default if the file cannot be opened.
                  @default is returned as-is; the built-in default list is
                  shared between calls, so callers must not mutate it
        """
        try:
            optionf = open(filename)
        except IOError:
            return default  # silently skip if file is not present
        # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
        with optionf:
            return compat_shlex_split(optionf.read(), comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of @opts (as a list) with the values of login-related options replaced by 'PRIVATE'"""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}

        def _scrub_eq(o):
            # Handles the "--option=value" form. partition() is used instead of
            # a regex so that values containing newlines are scrubbed as well
            key, sep, value = o.partition('=')
            if sep and value and key in PRIVATE_OPTS:
                return key + '=PRIVATE'
            return o

        opts = list(map(_scrub_eq, opts))
        # Handles the "--option value" form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from @args (see init) and keep it unless its file was already loaded"""
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the args of all child configs (last-added first, recursively), followed by own_args"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        """Parse all_args with the parser and return its result"""
        return self._parser.parse_args(list(self.all_args))