yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import collections
  11 import contextlib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import email.header
  16 import errno
  17 import functools
  18 import gzip
  19 import hashlib
  20 import hmac
  21 import importlib.util
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import operator
  28 import os
  29 import platform
  30 import random
  31 import re
  32 import socket
  33 import ssl
  34 import subprocess
  35 import sys
  36 import tempfile
  37 import time
  38 import traceback
  39 import xml.etree.ElementTree
  40 import zlib
  41 import mimetypes
  42
  43 from .compat import (
  44     compat_HTMLParseError,
  45     compat_HTMLParser,
  46     compat_HTTPError,
  47     compat_basestring,
  48     compat_chr,
  49     compat_cookiejar,
  50     compat_ctypes_WINFUNCTYPE,
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_html_entities,
  54     compat_html_entities_html5,
  55     compat_http_client,
  56     compat_integer_types,
  57     compat_numeric_types,
  58     compat_kwargs,
  59     compat_os_name,
  60     compat_parse_qs,
  61     compat_shlex_split,
  62     compat_shlex_quote,
  63     compat_str,
  64     compat_struct_pack,
  65     compat_struct_unpack,
  66     compat_urllib_error,
  67     compat_urllib_parse,
  68     compat_urllib_parse_urlencode,
  69     compat_urllib_parse_urlparse,
  70     compat_urllib_parse_urlunparse,
  71     compat_urllib_parse_quote,
  72     compat_urllib_parse_quote_plus,
  73     compat_urllib_parse_unquote_plus,
  74     compat_urllib_request,
  75     compat_urlparse,
  76     compat_xpath,
  77 )
  78
  79 from .socks import (
  80     ProxyType,
  81     sockssocket,
  82 )
  83
  84
  85 def register_socks_protocols():
  86     # "Register" SOCKS protocols
  87     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  88     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  89     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  90         if scheme not in compat_urlparse.uses_netloc:
  91             compat_urlparse.uses_netloc.append(scheme)
  92
  93
# The class of compiled regular expression objects, obtained by compiling an
# empty pattern (this is not clearly defined otherwise)
compiled_regex_type = type(re.compile(''))
  96
  97
  98 def random_user_agent():
  99     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 100     _CHROME_VERSIONS = (
 101         '90.0.4430.212',
 102         '90.0.4430.24',
 103         '90.0.4430.70',
 104         '90.0.4430.72',
 105         '90.0.4430.85',
 106         '90.0.4430.93',
 107         '91.0.4472.101',
 108         '91.0.4472.106',
 109         '91.0.4472.114',
 110         '91.0.4472.124',
 111         '91.0.4472.164',
 112         '91.0.4472.19',
 113         '91.0.4472.77',
 114         '92.0.4515.107',
 115         '92.0.4515.115',
 116         '92.0.4515.131',
 117         '92.0.4515.159',
 118         '92.0.4515.43',
 119         '93.0.4556.0',
 120         '93.0.4577.15',
 121         '93.0.4577.63',
 122         '93.0.4577.82',
 123         '94.0.4606.41',
 124         '94.0.4606.54',
 125         '94.0.4606.61',
 126         '94.0.4606.71',
 127         '94.0.4606.81',
 128         '94.0.4606.85',
 129         '95.0.4638.17',
 130         '95.0.4638.50',
 131         '95.0.4638.54',
 132         '95.0.4638.69',
 133         '95.0.4638.74',
 134         '96.0.4664.18',
 135         '96.0.4664.45',
 136         '96.0.4664.55',
 137         '96.0.4664.93',
 138         '97.0.4692.20',
 139     )
 140     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 141
 142
# Default HTTP request headers.
# NOTE: random_user_agent() is called once, when this dict literal is evaluated
# at import time, so the User-Agent does not change afterwards unless the dict
# itself is modified.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 150
 151
# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 155
 156
# Sentinel meaning "no default value was supplied". It is compared by identity
# (e.g. in the xpath_* helpers), so None can still be passed as a real default.
NO_DEFAULT = object()
 158
# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 169
# File extensions (without the leading dot), grouped per line by family.
# NOTE(review): presumably the extensions treated as known media/manifest
# formats; the code consuming this tuple is outside this section - confirm.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 184
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# zipped position by position; the list items in the chain are the replacements
# that are longer than one character (e.g. 'Æ' -> 'AE', 'ß' -> 'ss').
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 189
# Date/time patterns written with strftime/strptime-style directives.
# These are the patterns that are not ambiguous between day-first and
# month-first notation.
# NOTE(review): presumably tried one after another by the date parsing helpers
# defined elsewhere in this file - confirm against their callers.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# DATE_FORMATS plus the numeric patterns that put the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# DATE_FORMATS plus the numeric patterns that put the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 253
# Matches a call tail of the form }('<payload>',<digits>,<digits>,'<a|b|c>'.split('|')
# and captures the payload, the two numbers and the '|'-separated word list.
# NOTE(review): presumably targets "packed" (obfuscated) JavaScript - confirm
# against the code that uses it.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (optionally
# quoted, case-insensitive) and captures its body as the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 256
 257
 258 def preferredencoding():
 259     """Get preferred encoding.
 260
 261     Returns the best encoding scheme for the system, based on
 262     locale.getpreferredencoding() and some further tweaks.
 263     """
 264     try:
 265         pref = locale.getpreferredencoding()
 266         'TEST'.encode(pref)
 267     except Exception:
 268         pref = 'UTF-8'
 269
 270     return pref
 271
 272
 273 def write_json_file(obj, fn):
 274     """ Encode obj as JSON and write it to fn, atomically if possible """
 275
 276     fn = encodeFilename(fn)
 277     if sys.version_info < (3, 0) and sys.platform != 'win32':
 278         encoding = get_filesystem_encoding()
 279         # os.path.basename returns a bytes object, but NamedTemporaryFile
 280         # will fail if the filename contains non ascii characters unless we
 281         # use a unicode object
 282         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 283         # the same for os.path.dirname
 284         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 285     else:
 286         path_basename = os.path.basename
 287         path_dirname = os.path.dirname
 288
 289     args = {
 290         'suffix': '.tmp',
 291         'prefix': path_basename(fn) + '.',
 292         'dir': path_dirname(fn),
 293         'delete': False,
 294     }
 295
 296     # In Python 2.x, json.dump expects a bytestream.
 297     # In Python 3.x, it writes to a character stream
 298     if sys.version_info < (3, 0):
 299         args['mode'] = 'wb'
 300     else:
 301         args.update({
 302             'mode': 'w',
 303             'encoding': 'utf-8',
 304         })
 305
 306     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 307
 308     try:
 309         with tf:
 310             json.dump(obj, tf, ensure_ascii=False)
 311         if sys.platform == 'win32':
 312             # Need to remove existing file on Windows, else os.rename raises
 313             # WindowsError or FileExistsError.
 314             try:
 315                 os.unlink(fn)
 316             except OSError:
 317                 pass
 318         try:
 319             mask = os.umask(0)
 320             os.umask(mask)
 321             os.chmod(tf.name, 0o666 & ~mask)
 322         except OSError:
 323             pass
 324         os.rename(tf.name, fn)
 325     except Exception:
 326         try:
 327             os.remove(tf.name)
 328         except OSError:
 329             pass
 330         raise
 331
 332
 333 if sys.version_info >= (2, 7):
 334     def find_xpath_attr(node, xpath, key, val=None):
 335         """ Find the xpath xpath[@key=val] """
 336         assert re.match(r'^[a-zA-Z_-]+$', key)
 337         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 338         return node.find(expr)
 339 else:
 340     def find_xpath_attr(node, xpath, key, val=None):
 341         for f in node.findall(compat_xpath(xpath)):
 342             if key not in f.attrib:
 343                 continue
 344             if val is None or f.attrib.get(key) == val:
 345                 return f
 346         return None
 347
# On Python 2.6, the xml.etree.ElementTree.Element methods do not support
# the namespaces parameter
 350
 351
 352 def xpath_with_ns(path, ns_map):
 353     components = [c.split(':') for c in path.split('/')]
 354     replaced = []
 355     for c in components:
 356         if len(c) == 1:
 357             replaced.append(c[0])
 358         else:
 359             ns, tag = c
 360             replaced.append('{%s}%s' % (ns_map[ns], tag))
 361     return '/'.join(replaced)
 362
 363
 364 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 365     def _find_xpath(xpath):
 366         return node.find(compat_xpath(xpath))
 367
 368     if isinstance(xpath, (str, compat_str)):
 369         n = _find_xpath(xpath)
 370     else:
 371         for xp in xpath:
 372             n = _find_xpath(xp)
 373             if n is not None:
 374                 break
 375
 376     if n is None:
 377         if default is not NO_DEFAULT:
 378             return default
 379         elif fatal:
 380             name = xpath if name is None else name
 381             raise ExtractorError('Could not find XML element %s' % name)
 382         else:
 383             return None
 384     return n
 385
 386
 387 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 388     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 389     if n is None or n == default:
 390         return n
 391     if n.text is None:
 392         if default is not NO_DEFAULT:
 393             return default
 394         elif fatal:
 395             name = xpath if name is None else name
 396             raise ExtractorError('Could not find XML element\'s text %s' % name)
 397         else:
 398             return None
 399     return n.text
 400
 401
 402 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 403     n = find_xpath_attr(node, xpath, key)
 404     if n is None:
 405         if default is not NO_DEFAULT:
 406             return default
 407         elif fatal:
 408             name = '%s[@%s]' % (xpath, key) if name is None else name
 409             raise ExtractorError('Could not find XML attribute %s' % name)
 410         else:
 411             return None
 412     return n.attrib[key]
 413
 414
 415 def get_element_by_id(id, html):
 416     """Return the content of the tag with the specified ID in the passed HTML document"""
 417     return get_element_by_attribute('id', id, html)
 418
 419
 420 def get_element_html_by_id(id, html):
 421     """Return the html of the tag with the specified ID in the passed HTML document"""
 422     return get_element_html_by_attribute('id', id, html)
 423
 424
 425 def get_element_by_class(class_name, html):
 426     """Return the content of the first tag with the specified class in the passed HTML document"""
 427     retval = get_elements_by_class(class_name, html)
 428     return retval[0] if retval else None
 429
 430
 431 def get_element_html_by_class(class_name, html):
 432     """Return the html of the first tag with the specified class in the passed HTML document"""
 433     retval = get_elements_html_by_class(class_name, html)
 434     return retval[0] if retval else None
 435
 436
 437 def get_element_by_attribute(attribute, value, html, escape_value=True):
 438     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 439     return retval[0] if retval else None
 440
 441
 442 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
 443     retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
 444     return retval[0] if retval else None
 445
 446
 447 def get_elements_by_class(class_name, html):
 448     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 449     return get_elements_by_attribute(
 450         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 451         html, escape_value=False)
 452
 453
 454 def get_elements_html_by_class(class_name, html):
 455     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 456     return get_elements_html_by_attribute(
 457         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 458         html, escape_value=False)
 459
 460
 461 def get_elements_by_attribute(*args, **kwargs):
 462     """Return the content of the tag with the specified attribute in the passed HTML document"""
 463     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 464
 465
 466 def get_elements_html_by_attribute(*args, **kwargs):
 467     """Return the html of the tag with the specified attribute in the passed HTML document"""
 468     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 469
 470
 471 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 472     """
 473     Return the text (content) and the html (whole) of the tag with the specified
 474     attribute in the passed HTML document
 475     """
 476
 477     value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 478
 479     value = re.escape(value) if escape_value else value
 480
 481     partial_element_re = r'''(?x)
 482         <(?P<tag>[a-zA-Z0-9:._-]+)
 483          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 484          \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
 485         ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
 486
 487     for m in re.finditer(partial_element_re, html):
 488         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 489
 490         yield (
 491             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 492             whole
 493         )
 494
 495
 496 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 497     """
 498     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 499     closing tag for the first opening tag it has encountered, and can be used
 500     as a context manager
 501     """
 502
 503     class HTMLBreakOnClosingTagException(Exception):
 504         pass
 505
 506     def __init__(self):
 507         self.tagstack = collections.deque()
 508         compat_HTMLParser.__init__(self)
 509
 510     def __enter__(self):
 511         return self
 512
 513     def __exit__(self, *_):
 514         self.close()
 515
 516     def close(self):
 517         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 518         # so data remains buffered; we no longer have any interest in it, thus
 519         # override this method to discard it
 520         pass
 521
 522     def handle_starttag(self, tag, _):
 523         self.tagstack.append(tag)
 524
 525     def handle_endtag(self, tag):
 526         if not self.tagstack:
 527             raise compat_HTMLParseError('no tags in the stack')
 528         while self.tagstack:
 529             inner_tag = self.tagstack.pop()
 530             if inner_tag == tag:
 531                 break
 532         else:
 533             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 534         if not self.tagstack:
 535             raise self.HTMLBreakOnClosingTagException()
 536
 537
 538 def get_element_text_and_html_by_tag(tag, html):
 539     """
 540     For the first element with the specified tag in the passed HTML document
 541     return its' content (text) and the whole element (html)
 542     """
 543     def find_or_raise(haystack, needle, exc):
 544         try:
 545             return haystack.index(needle)
 546         except ValueError:
 547             raise exc
 548     closing_tag = f'</{tag}>'
 549     whole_start = find_or_raise(
 550         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 551     content_start = find_or_raise(
 552         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 553     content_start += whole_start + 1
 554     with HTMLBreakOnClosingTagParser() as parser:
 555         parser.feed(html[whole_start:content_start])
 556         if not parser.tagstack or parser.tagstack[0] != tag:
 557             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 558         offset = content_start
 559         while offset < len(html):
 560             next_closing_tag_start = find_or_raise(
 561                 html[offset:], closing_tag,
 562                 compat_HTMLParseError(f'closing {tag} tag not found'))
 563             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 564             try:
 565                 parser.feed(html[offset:offset + next_closing_tag_end])
 566                 offset += next_closing_tag_end
 567             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 568                 return html[content_start:offset + next_closing_tag_start], \
 569                     html[whole_start:offset + next_closing_tag_end]
 570         raise compat_HTMLParseError('unexpected end of html')
 571
 572
 573 class HTMLAttributeParser(compat_HTMLParser):
 574     """Trivial HTML parser to gather the attributes for a single element"""
 575
 576     def __init__(self):
 577         self.attrs = {}
 578         compat_HTMLParser.__init__(self)
 579
 580     def handle_starttag(self, tag, attrs):
 581         self.attrs = dict(attrs)
 582
 583
 584 class HTMLListAttrsParser(compat_HTMLParser):
 585     """HTML parser to gather the attributes for the elements of a list"""
 586
 587     def __init__(self):
 588         compat_HTMLParser.__init__(self)
 589         self.items = []
 590         self._level = 0
 591
 592     def handle_starttag(self, tag, attrs):
 593         if tag == 'li' and self._level == 0:
 594             self.items.append(dict(attrs))
 595         self._level += 1
 596
 597     def handle_endtag(self, tag):
 598         self._level -= 1
 599
 600
 601 def extract_attributes(html_element):
 602     """Given a string for an HTML element such as
 603     <el
 604          a="foo" B="bar" c="&98;az" d=boz
 605          empty= noval entity="&amp;"
 606          sq='"' dq="'"
 607     >
 608     Decode and return a dictionary of attributes.
 609     {
 610         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 611         'empty': '', 'noval': None, 'entity': '&',
 612         'sq': '"', 'dq': '\''
 613     }.
 614     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 615     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 616     """
 617     parser = HTMLAttributeParser()
 618     try:
 619         parser.feed(html_element)
 620         parser.close()
 621     # Older Python may throw HTMLParseError in case of malformed HTML
 622     except compat_HTMLParseError:
 623         pass
 624     return parser.attrs
 625
 626
 627 def parse_list(webpage):
 628     """Given a string for an series of HTML <li> elements,
 629     return a dictionary of their attributes"""
 630     parser = HTMLListAttrsParser()
 631     parser.feed(webpage)
 632     parser.close()
 633     return parser.items
 634
 635
 636 def clean_html(html):
 637     """Clean an HTML snippet into a readable string"""
 638
 639     if html is None:  # Convenience for sanitizing descriptions etc.
 640         return html
 641
 642     # Newline vs <br />
 643     html = html.replace('\n', ' ')
 644     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 645     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 646     # Strip html tags
 647     html = re.sub('<.*?>', '', html)
 648     # Replace html entities
 649     html = unescapeHTML(html)
 650     return html.strip()
 651
 652
 653 def sanitize_open(filename, open_mode):
 654     """Try to open the given filename, and slightly tweak it if this fails.
 655
 656     Attempts to open the given filename. If this fails, it tries to change
 657     the filename slightly, step by step, until it's either able to open it
 658     or it fails and raises a final exception, like the standard open()
 659     function.
 660
 661     It returns the tuple (stream, definitive_file_name).
 662     """
 663     try:
 664         if filename == '-':
 665             if sys.platform == 'win32':
 666                 import msvcrt
 667                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 668             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 669         stream = open(encodeFilename(filename), open_mode)
 670         return (stream, filename)
 671     except (IOError, OSError) as err:
 672         if err.errno in (errno.EACCES,):
 673             raise
 674
 675         # In case of error, try to remove win32 forbidden chars
 676         alt_filename = sanitize_path(filename)
 677         if alt_filename == filename:
 678             raise
 679         else:
 680             # An exception here should be caught in the caller
 681             stream = open(encodeFilename(alt_filename), open_mode)
 682             return (stream, alt_filename)
 683
 684
 685 def timeconvert(timestr):
 686     """Convert RFC 2822 defined time string into system timestamp"""
 687     timestamp = None
 688     timetuple = email.utils.parsedate_tz(timestr)
 689     if timetuple is not None:
 690         timestamp = email.utils.mktime_tz(timetuple)
 691     return timestamp
 692
 693
 694 def sanitize_filename(s, restricted=False, is_id=False):
 695     """Sanitizes a string so it could be used as part of a filename.
 696     If restricted is set, use a stricter subset of allowed characters.
 697     Set is_id if this is not an arbitrary string, but an ID that should be kept
 698     if possible.
 699     """
 700     def replace_insane(char):
 701         if restricted and char in ACCENT_CHARS:
 702             return ACCENT_CHARS[char]
 703         elif not restricted and char == '\n':
 704             return ' '
 705         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 706             return ''
 707         elif char == '"':
 708             return '' if restricted else '\''
 709         elif char == ':':
 710             return '_-' if restricted else ' -'
 711         elif char in '\\/|*<>':
 712             return '_'
 713         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 714             return '_'
 715         if restricted and ord(char) > 127:
 716             return '_'
 717         return char
 718
 719     if s == '':
 720         return ''
 721     # Handle timestamps
 722     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 723     result = ''.join(map(replace_insane, s))
 724     if not is_id:
 725         while '__' in result:
 726             result = result.replace('__', '_')
 727         result = result.strip('_')
 728         # Common case of "Foreign band name - English song title"
 729         if restricted and result.startswith('-_'):
 730             result = result[2:]
 731         if result.startswith('-'):
 732             result = '_' + result[len('-'):]
 733         result = result.lstrip('.')
 734         if not result:
 735             result = '_'
 736     return result
 737
 738
 739 def sanitize_path(s, force=False):
 740     """Sanitizes and normalizes path on Windows"""
 741     if sys.platform == 'win32':
 742         force = False
 743         drive_or_unc, _ = os.path.splitdrive(s)
 744         if sys.version_info < (2, 7) and not drive_or_unc:
 745             drive_or_unc, _ = os.path.splitunc(s)
 746     elif force:
 747         drive_or_unc = ''
 748     else:
 749         return s
 750
 751     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 752     if drive_or_unc:
 753         norm_path.pop(0)
 754     sanitized_path = [
 755         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 756         for path_part in norm_path]
 757     if drive_or_unc:
 758         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 759     elif force and s[0] == os.path.sep:
 760         sanitized_path.insert(0, os.path.sep)
 761     return os.path.join(*sanitized_path)
 762
 763
 764 def sanitize_url(url):
 765     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 766     # the number of unwanted failures due to missing protocol
 767     if url.startswith('//'):
 768         return 'http:%s' % url
 769     # Fix some common typos seen so far
 770     COMMON_TYPOS = (
 771         # https://github.com/ytdl-org/youtube-dl/issues/15649
 772         (r'^httpss://', r'https://'),
 773         # https://bx1.be/lives/direct-tv/
 774         (r'^rmtp([es]?)://', r'rtmp\1://'),
 775     )
 776     for mistake, fixup in COMMON_TYPOS:
 777         if re.match(mistake, url):
 778             return re.sub(mistake, fixup, url)
 779     return url
 780
 781
 782 def extract_basic_auth(url):
 783     parts = compat_urlparse.urlsplit(url)
 784     if parts.username is None:
 785         return url, None
 786     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 787         parts.hostname if parts.port is None
 788         else '%s:%d' % (parts.hostname, parts.port))))
 789     auth_payload = base64.b64encode(
 790         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 791     return url, 'Basic ' + auth_payload.decode('utf-8')
 792
 793
 794 def sanitized_Request(url, *args, **kwargs):
 795     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 796     if auth_header is not None:
 797         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 798         headers['Authorization'] = auth_header
 799     return compat_urllib_request.Request(url, *args, **kwargs)
 800
 801
 802 def expand_path(s):
 803     """Expand shell variables and ~"""
 804     return os.path.expandvars(compat_expanduser(s))
 805
 806
 807 def orderedSet(iterable):
 808     """ Remove all duplicates from the input iterable """
 809     res = []
 810     for el in iterable:
 811         if el not in res:
 812             res.append(el)
 813     return res
 814
 815
 816 def _htmlentity_transform(entity_with_semicolon):
 817     """Transforms an HTML entity to a character."""
 818     entity = entity_with_semicolon[:-1]
 819
 820     # Known non-numeric HTML entity
 821     if entity in compat_html_entities.name2codepoint:
 822         return compat_chr(compat_html_entities.name2codepoint[entity])
 823
 824     # TODO: HTML5 allows entities without a semicolon. For example,
 825     # '&Eacuteric' should be decoded as 'Éric'.
 826     if entity_with_semicolon in compat_html_entities_html5:
 827         return compat_html_entities_html5[entity_with_semicolon]
 828
 829     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 830     if mobj is not None:
 831         numstr = mobj.group(1)
 832         if numstr.startswith('x'):
 833             base = 16
 834             numstr = '0%s' % numstr
 835         else:
 836             base = 10
 837         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 838         try:
 839             return compat_chr(int(numstr, base))
 840         except ValueError:
 841             pass
 842
 843     # Unknown entity in name, return its literal representation
 844     return '&%s;' % entity
 845
 846
 847 def unescapeHTML(s):
 848     if s is None:
 849         return None
 850     assert type(s) == compat_str
 851
 852     return re.sub(
 853         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 854
 855
 856 def escapeHTML(text):
 857     return (
 858         text
 859         .replace('&', '&amp;')
 860         .replace('<', '&lt;')
 861         .replace('>', '&gt;')
 862         .replace('"', '&quot;')
 863         .replace("'", '&#39;')
 864     )
 865
 866
 867 def process_communicate_or_kill(p, *args, **kwargs):
 868     try:
 869         return p.communicate(*args, **kwargs)
 870     except BaseException:  # Including KeyboardInterrupt
 871         p.kill()
 872         p.wait()
 873         raise
 874
 875
 876 class Popen(subprocess.Popen):
 877     if sys.platform == 'win32':
 878         _startupinfo = subprocess.STARTUPINFO()
 879         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 880     else:
 881         _startupinfo = None
 882
 883     def __init__(self, *args, **kwargs):
 884         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 885
 886     def communicate_or_kill(self, *args, **kwargs):
 887         return process_communicate_or_kill(self, *args, **kwargs)
 888
 889
 890 def get_subprocess_encoding():
 891     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 892         # For subprocess calls, encode with locale encoding
 893         # Refer to http://stackoverflow.com/a/9951851/35070
 894         encoding = preferredencoding()
 895     else:
 896         encoding = sys.getfilesystemencoding()
 897     if encoding is None:
 898         encoding = 'utf-8'
 899     return encoding
 900
 901
 902 def encodeFilename(s, for_subprocess=False):
 903     """
 904     @param s The name of the file
 905     """
 906
 907     assert type(s) == compat_str
 908
 909     # Python 3 has a Unicode API
 910     if sys.version_info >= (3, 0):
 911         return s
 912
 913     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 914     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 915     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 916     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 917         return s
 918
 919     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 920     if sys.platform.startswith('java'):
 921         return s
 922
 923     return s.encode(get_subprocess_encoding(), 'ignore')
 924
 925
 926 def decodeFilename(b, for_subprocess=False):
 927
 928     if sys.version_info >= (3, 0):
 929         return b
 930
 931     if not isinstance(b, bytes):
 932         return b
 933
 934     return b.decode(get_subprocess_encoding(), 'ignore')
 935
 936
 937 def encodeArgument(s):
 938     if not isinstance(s, compat_str):
 939         # Legacy code that uses byte strings
 940         # Uncomment the following line after fixing all post processors
 941         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 942         s = s.decode('ascii')
 943     return encodeFilename(s, True)
 944
 945
 946 def decodeArgument(b):
 947     return decodeFilename(b, True)
 948
 949
 950 def decodeOption(optval):
 951     if optval is None:
 952         return optval
 953     if isinstance(optval, bytes):
 954         optval = optval.decode(preferredencoding())
 955
 956     assert isinstance(optval, compat_str)
 957     return optval
 958
 959
 960 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 961
 962
 963 def timetuple_from_msec(msec):
 964     secs, msec = divmod(msec, 1000)
 965     mins, secs = divmod(secs, 60)
 966     hrs, mins = divmod(mins, 60)
 967     return _timetuple(hrs, mins, secs, msec)
 968
 969
 970 def formatSeconds(secs, delim=':', msec=False):
 971     time = timetuple_from_msec(secs * 1000)
 972     if time.hours:
 973         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 974     elif time.minutes:
 975         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 976     else:
 977         ret = '%d' % time.seconds
 978     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 979
 980
 981 def _ssl_load_windows_store_certs(ssl_context, storename):
 982     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 983     try:
 984         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 985                  if encoding == 'x509_asn' and (
 986                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 987     except PermissionError:
 988         return
 989     for cert in certs:
 990         try:
 991             ssl_context.load_verify_locations(cadata=cert)
 992         except ssl.SSLError:
 993             pass
 994
 995
 996 def make_HTTPS_handler(params, **kwargs):
 997     opts_check_certificate = not params.get('nocheckcertificate')
 998     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 999     context.check_hostname = opts_check_certificate
1000     if params.get('legacyserverconnect'):
1001         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
1002     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1003     if opts_check_certificate:
1004         try:
1005             context.load_default_certs()
1006             # Work around the issue in load_default_certs when there are bad certificates. See:
1007             # https://github.com/yt-dlp/yt-dlp/issues/1060,
1008             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1009         except ssl.SSLError:
1010             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1011             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1012                 # Create a new context to discard any certificates that were already loaded
1013                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1014                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
1015                 for storename in ('CA', 'ROOT'):
1016                     _ssl_load_windows_store_certs(context, storename)
1017             context.set_default_verify_paths()
1018     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1019
1020
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text.

    ``before`` is the text the message is appended to (trailing whitespace
    is stripped, one space is inserted). When it is empty or ends a
    sentence ('.', '!' or '?') the message starts with a capital letter.
    """
    update_cmd = (
        'type  yt-dlp -U  to update' if ytdl_is_updateable()
        else 'see  https://github.com/yt-dlp/yt-dlp  on how to update')
    msg = ''.join((
        'please report this issue on  https://github.com/yt-dlp/yt-dlp .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call yt-dlp with the --verbose flag and include its complete output.',
    ))

    before = before.rstrip()
    starts_new_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_new_sentence:
        msg = msg[0].title() + msg[1:]

    if before:
        return before + ' ' + msg
    return msg
1035
1036
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        """Use ``msg`` if given, else the class-level ``msg``, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1047
1048
# Tuple of exception classes treated as network errors
network_exceptions = (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1053
1054
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """Build the error and its full message.

        tb        -- original traceback, kept so that it can be printed out
        expected  -- set for normal error messages that are most likely not a
                     bug in yt-dlp; forced on when the exception currently
                     being handled is one of ``network_exceptions``
        cause     -- appended to the message as " (caused by ...)"
        video_id, ie -- prepended to the message as "[ie] video_id: "

        Unless the error is expected, the bug report instructions are
        appended to the message.
        """
        current_exc = sys.exc_info()  # preserve original exception
        if current_exc[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = current_exc

        parts = [
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
        ]
        if not expected:
            parts.append(bug_reports_message())
        super().__init__(''.join(parts))

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
1084
1085
class UnsupportedError(ExtractorError):
    """Expected extractor error reporting an unsupported URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1091
1092
class RegexNotFoundError(ExtractorError):
    """Error raised when a regular expression did not match."""
    # Adds nothing to ExtractorError; exists so that this case can be told apart
    pass
1096
1097
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        """Always created as an expected error; ``countries`` is stored as given."""
        kwargs.update(expected=True)
        super().__init__(msg, **kwargs)
        self.countries = countries
1109
1110
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1123
1124
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Message used when the exception is created without one
    msg = 'Entry not found in info'
1132
1133
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ``filename`` to the default message when it is given."""
        if filename is not None:
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1146
1147
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by a PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
1154
1155
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when the exception is created without one
    msg = 'The download was cancelled'
1159
1160
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Message used when the exception is created without one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1164
1165
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Message used when the exception is created without one
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1169
1170
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Message used when the exception is created without one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1174
1175
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """Store ``expected`` on the instance next to the message."""
        self.expected = expected
        super().__init__(msg)
1182
1183
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        """Always uses the class message and is never marked as expected."""
        super().__init__(msg=self.msg, expected=False)
1190
1191
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ``err`` to the default message when it is given."""
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1204
1205
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """``downloaded`` and ``expected`` are sizes in bytes; both are kept as attributes."""
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1221
1222
class XAttrMetadataError(YoutubeDLError):
    """Extended attribute error carrying a coarse ``reason``.

    ``reason`` is derived from the error code and message and is one of
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        def message_mentions(*phrases):
            return any(phrase in self.msg for phrase in phrases)

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or message_mentions('No space left', 'Disk quota exceeded')):
            reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or message_mentions('Argument list too long'):
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1237
1238
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): presumably raised when extended attributes cannot be used
    # at all on this system -- confirm against the callers
    pass
1241
1242
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate ``http_class`` and, if configured, tie it to a source address.

    ydl_handler -- handler whose ``_params`` dict is consulted for 'source_address'
    http_class  -- connection class, instantiated with ``*args`` and ``**kwargs``
    is_https    -- only used by the Python 2.6 fallback, to decide whether the
                   socket gets wrapped with SSL

    Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        # NOTE: the ``source_address`` parameter below shadows the outer variable;
        # inside this function it is indexed and passed to bind(), i.e. it is
        # expected to be an address tuple such as ``sa`` further down
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A source address containing '.' is treated as IPv4, anything else as IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            # Keep only the remote addresses of the same family as the source address
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try the remaining addresses in order; the first successful connection wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and move on to the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed: re-raise the last error seen
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        # Only replace the hook on connection objects that have one
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 is what gets passed to bind() together with the configured address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1306
1307
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker ``headers`` is returned as is. With it, a new dict is
    returned that contains neither the marker nor any 'Accept-Encoding'
    header (matched case-insensitively); ``headers`` itself is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name != 'Youtubedl-no-compression' and name.lower() != 'accept-encoding'}
1316
1317
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """``params`` is stored as ``_params``; everything else goes to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open ``req``, going through a SOCKS proxy if the request asks for one."""
        conn_class = compat_http_client.HTTPConnection

        # 'Ytdl-socks-proxy' is an internal header: it selects the connection
        # class and is removed so that it is not part of the request headers
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded ``data``; empty input is returned as is."""
        if not data:
            return data
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a stream with the regular zlib header
            return zlib.decompress(data)

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        The URL is percent-encoded where needed, missing standard headers are
        added and the internal no-compression marker is applied.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response and return the response to hand on.

        gzip and deflate bodies are decompressed (the Content-encoding header
        is removed afterwards) and the Location header of 3xx responses is
        percent-encoded where needed.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if no attempt
                # succeeds, the error of the first attempt is raised
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            # Wrap the decompressed body in a new response that reuses the
            # original headers object, URL, status code and message
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    # Get the raw bytes back and interpret them as UTF-8 instead
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                # The header is only replaced when escaping changed something
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests and responses get exactly the same processing
    https_request = http_request
    https_response = http_response
1441
1442
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of ``base_class`` that connects through a SOCKS proxy.

    base_class  -- HTTPConnection or HTTPSConnection (or a subclass of either)
    socks_proxy -- proxy URL; the scheme selects the protocol ('socks5',
                   'socks4a', 'socks4' or 'socks', the latter meaning SOCKS4),
                   the port defaults to 1080 and the optional user name and
                   password are URL-unquoted

    Raises ValueError for any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear error instead of leaving socks_type unbound
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Replace the regular socket with one that talks to the proxy
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the TLS layer is added on top of the proxied socket
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1484
1485
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of the HTTP handler, with optional SOCKS proxy support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """``https_conn_class`` defaults to HTTPSConnection; the rest goes to HTTPSHandler."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open ``req``, going through a SOCKS proxy if the request asks for one."""
        conn_class = self._https_conn_class

        # Pass on only those settings that the base handler stored on this
        # instance ('_context': python > 2.6, '_check_hostname': python 3.x)
        forwarded = (('context', '_context'), ('check_hostname', '_check_hostname'))
        open_kwargs = {
            option: getattr(self, attr)
            for option, attr in forwarded if hasattr(self, attr)}

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1509
1510
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Write the cookies to a file as UTF-8.

        ``filename`` falls back to ``self.filename``; ValueError is raised
        when neither is set. Most of the code is taken from CPython 3.8 and
        slightly adapted to support cookie files with UTF-8 in both python 2
        and 3.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                expires = '' if cookie.expires is None else compat_str(cookie.expires)
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name, value = '', cookie.name
                else:
                    name, value = cookie.name, cookie.value
                fields = [cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value]
                f.write('\t'.join(fields) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        ``filename`` falls back to ``self.filename``; ValueError is raised
        when neither is set. Malformed entries are skipped with a warning
        on stderr.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the HttpOnly marker, then validate real entries; raises
            # LoadError for entries that cannot be used
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            fields = line.split('\t')
            if len(fields) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(fields))
            entry = self._CookieFileEntry(*fields)
            if entry.expires_at and not entry.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
            return line

        # Collect the usable lines in memory and let the base class parse them
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    prepared = prepare_line(line)
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                else:
                    cf.write(prepared)
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1627
1628
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that handles HTTPS traffic the same way as HTTP."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        """Hand the response to the stock cookie processing unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # (The workaround is currently disabled:)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1651
1652
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Every supported redirect status is served by the stock 302 handler
    http_error_301 = compat_urllib_request.HTTPRedirectHandler.http_error_302
    http_error_303 = http_error_301
    http_error_307 = http_error_301
    http_error_308 = http_error_301

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        Called by the http_error_30x methods when a redirection response
        is received. Returns a new Request for `newurl` when the redirect
        should be followed, and raises HTTPError when the combination of
        status code and request method must not be redirected.
        """
        method = req.get_method()
        idempotent_redirect = (
            code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD"))
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        post_redirect = code in (301, 302, 303) and method == "POST"
        if not idempotent_redirect and not post_redirect:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        target_url = newurl.replace(' ', '%20')

        # Headers describing the original request body are not forwarded
        dropped_headers = ("content-length", "content-type")
        forwarded_headers = {
            name: value for name, value in req.headers.items()
            if name.lower() not in dropped_headers}
        return compat_urllib_request.Request(
            target_url,
            headers=forwarded_headers,
            origin_req_host=req.origin_req_host,
            unverifiable=True)
1708
1709
def extract_timezone(date_str):
    """Split a trailing UTC offset off a date string.

    Returns a tuple (offset, remainder): `offset` is a datetime.timedelta
    (zero when no offset was recognised or the marker is "Z") and
    `remainder` is `date_str` shortened by the length of the matched
    timezone text (unchanged when nothing matched).
    """
    tz_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    direction = tz_match.group('sign')
    if not direction:
        # Plain "Z": UTC, i.e. no offset
        return datetime.timedelta(), remainder

    factor = 1 if direction == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(tz_match.group('hours')),
        minutes=factor * int(tz_match.group('minutes')))
    return offset, remainder
1734
1735
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp from the given date.

    `delimiter` separates the date and time parts. `timezone` is a
    datetime.timedelta UTC offset; when it is None the offset is taken
    from `date_str` itself. Returns None for None input or when the
    string does not match the expected layout.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the format parsed below
    trimmed = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, trimmed = extract_timezone(trimmed)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(trimmed, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1753
1754
def date_formats(day_first=True):
    """Return the day-first format table, or the month-first one when `day_first` is false."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1757
1758
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD.

    Returns None for None input or when no known format matches.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for layout in date_formats(day_first):
        with contextlib.suppress(ValueError):
            result = datetime.datetime.strptime(cleaned, layout).strftime('%Y%m%d')

    if result is None:
        # Fall back to RFC 2822 style parsing
        fields = email.utils.parsedate_tz(cleaned)
        if fields:
            with contextlib.suppress(ValueError):
                result = datetime.datetime(*fields[:6]).strftime('%Y%m%d')

    if result is None:
        return None
    return compat_str(result)
1785
1786
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    The known formats from date_formats(day_first) are tried first, then
    RFC 2822 parsing via email.utils. A UTC offset found in the string is
    applied in both cases. Returns None for None input or when the string
    cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The offset was already stripped from date_str by extract_timezone,
        # so parsedate_tz cannot see it and it has to be applied here.
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
    return None
1818
1819
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1831
1832
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>"."""
    subtitle_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1835
1836
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    round_to_unit = precision == 'auto'
    if round_to_unit:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return now
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if relative is None:
        # Not a relative expression: parse it as an absolute date
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware shift; the result is rounded to whole days
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if round_to_unit else shifted
1877
1878
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    """
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1887
1888
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_shift
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1896
1897
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    precision: microsecond|second|minute|hour|day. With 'microsecond'
    the object is returned unchanged.
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    epoch_seconds = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1914
1915
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1924
1925
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError when start is later than end.
        """
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range

        Accepts a date, a datetime (only its date part is considered) or
        a string understood by date_from_str.
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date, but ordering comparisons between
            # the two raise TypeError, so reduce it to a plain date first
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1955
1956
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1965
1966
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1973
1974
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call; it is only used
    when `out` has file descriptor 1 or 2 and the matching standard handle
    is a console. Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (presumably the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE ids)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Prototype: HANDLE GetStdHandle(DWORD)
    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    # Prototype: BOOL WriteConsoleW(HANDLE, LPWSTR, DWORD, DWORD*, LPVOID)
    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle counts as a console only if it is valid, its file type
        # (ignoring the REMOTE flag) is CHAR and GetConsoleMode succeeds on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, each chunk ending before the
    # next non-BMP character; such a character is then written on its own
    # with a count of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part of the chunk was actually written
            s = s[written.value:]
    return True
2047
2048
def write_string(s, out=None, encoding=None):
    """Write the text `s` to `out` (sys.stderr by default) and flush it.

    Binary streams receive `s` encoded with `encoding` (or a fallback
    encoding), ignoring characters that cannot be encoded.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'))
    if use_console_api and _windows_write_string(s, stream):
        return

    wants_bytes = (
        'b' in getattr(stream, 'mode', '')
        or sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        # Text wrapper: encode ourselves and write to the underlying buffer
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
2069
2070
def bytes_to_intlist(bs):
    """Return the items of `bs` as a list of integers (empty list for empty input)."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Items are single characters: convert each to its code point
    return list(map(ord, bs))
2078
2079
def intlist_to_bytes(xs):
    """Pack a sequence of integers as unsigned bytes; empty input gives b''."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
2084
2085
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) with whichever
# mechanism is available: LockFileEx/UnlockFileEx on win32, fcntl.flock where
# the fcntl module can be imported, and stubs raising IOError otherwise.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes structure passed as the last argument of LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file `f`; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError if UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Lock `f` with flock: LOCK_EX when `exclusive`, LOCK_SH otherwise."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock lock held on `f`."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Stub used when fcntl cannot be imported: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Stub used when fcntl cannot be imported: always raises IOError."""
            raise IOError(UNSUPPORTED_MSG)
2159
2160
class locked_file(object):
    """File wrapper that holds a file lock while used as a context manager.

    The file is opened on construction; entering the context takes the lock
    (exclusive unless the mode is 'r') and leaving it unlocks and closes
    the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2190
2191
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when it is unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2195
2196
def shell_quote(args):
    """Return `args` shell-quoted and joined by single spaces."""
    fs_encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename', hence the decoding
    return ' '.join(
        compat_shlex_quote(arg.decode(fs_encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2206
2207
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into the URL fragment. Data already smuggled
    into `url` is kept and takes precedence over `data` on key clashes.
    The caller's `data` dict is left unmodified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so the argument passed by the caller is not mutated
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2216
2217
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) for a URL produced by smuggle_url.

    URLs without smuggled data are returned unchanged together with `default`.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, encoded = smug_url.rpartition('#')
    payload = compat_parse_qs(encoded)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
2225
2226
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    num:    the number to format (anything float_or_none accepts)
    fmt:    %-format string receiving (scaled number, suffix)
    factor: base of the scaling; with 1024 the binary suffixes Ki, Mi, ...
            are used

    Returns None when `num` cannot be converted, is negative or is not
    finite.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0 or not math.isfinite(num):
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else int(math.log(num, factor))
    # Clamp to the available suffixes: very small values must not pick a
    # suffix by negative index and very large ones must not overflow the list
    exponent = max(0, min(exponent, len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2238
2239
def format_bytes(bytes):
    """Format a byte count with binary suffixes (e.g. KiB); 'N/A' when it cannot be formatted."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2242
2243
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of `s` using `unit_table`.

    `unit_table` maps unit names to multipliers. The number may use ','
    or '.' as decimal separator. Returns the product as an int, or None
    when `s` does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    normalized = found.group('num').replace(',', '.')
    multiplier = unit_table[found.group('unit')]
    return int(float(normalized) * multiplier)
2253
2254
def parse_filesize(s):
    """Parse a human-readable file size such as "5 MiB" into a number of bytes.

    Returns None for None input or when no known unit is found.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (symbol, decimal name stem, binary name stem) in order of increasing power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (symbol, decimal_stem, binary_stem) in enumerate(prefixes, start=1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        _UNIT_TABLE.update((
            (symbol + 'iB', binary),
            (symbol + 'B', decimal),
            (lower + 'B', binary),
            (symbol + 'b', decimal),
            (lower + 'b', decimal),
            (decimal_stem + 'bytes', decimal),
            (binary_stem + 'bytes', binary),
        ))

    return lookup_unit_table(_UNIT_TABLE, s)
2324
2325
def parse_count(s):
    """Parse a count such as "1.2M" or "1,000" into an integer.

    Returns None for None input or when no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric word, then surrounding whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(_UNIT_TABLE, text)
    if scaled is not None:
        return scaled

    # Last resort: a leading number followed by whitespace or end of string
    leading = re.match(r'([\d,.]+)(?:$|\s)', text)
    if not leading:
        return None
    return str_to_int(leading.group(1))
2353
2354
def parse_resolution(s):
    """Extract a resolution from a string.

    Recognises "<width>x<height>" (also with X, × or a comma), "<height>p"
    or "<height>i", and "4k"/"8k". Returns a dict with 'width' and/or
    'height', or an empty dict when nothing is recognised.
    """
    if s is None:
        return {}

    dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160 lines, 8k -> 4320 lines
        return {'height': int(kilo.group(1)) * 540}

    return {}
2375
2376
def parse_bitrate(s):
    """Return the integer preceding "kbps" in `s`, or None if there is none or `s` is not a string."""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2383
2384
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in the given language.

    Unknown languages fall back to the English names; unknown names
    give None. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2394
2395
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None for an unknown abbreviation """

    # Abbreviations are the first three letters of the English month names
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2404
2405
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML"""
    # Named XML entities and numeric references of up to 4 digits are left alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2412
2413
def setproctitle(title):
    """Set the process title through libc's prctl().

    Does nothing when running on Jython, when libc.so.6 cannot be loaded
    or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes allocates one extra byte, so the
    # string handed to prctl is always NUL-terminated. A buffer sized exactly
    # to the title has no room for the terminator.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME in linux/prctl.h
    except AttributeError:
        return  # Strange libc, just skip this
2438
2439
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned unchanged if it is None or lacks the prefix."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2442
2443
def remove_end(s, end):
    """Return `s` without the suffix `end`.

    `s` is returned unchanged if it is None, lacks the suffix or `end`
    is empty.
    """
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: s[:-len(end)] would give '' for an empty `end`
    return s[:len(s) - len(end)]
2446
2447
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`, if present."""
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
2455
2456
def get_domain(url):
    """Return the host part of `url` without scheme and leading "www.", or None if no dotted host is found."""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
2460
2461
def url_basename(url):
    """Return the last path segment of `url`, ignoring leading and trailing slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
2465
2466
def base_url(url):
    """Return `url` up to and including the last '/' before any query or fragment.

    Raises AttributeError when `url` is not an http(s) URL of that shape.
    """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
2469
2470
def urljoin(base, path):
    """Join `path` onto `base`.

    Byte strings are decoded as UTF-8. A `path` that already is an
    absolute or protocol-relative URL is returned as is. Returns None
    when `path` is empty or not a string, or when `base` is not an
    http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    usable_base = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not usable_base:
        return None
    return compat_urlparse.urljoin(base, path)
2484
2485
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
2489
2490
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
2494
2495
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int, multiplied by `invscale` and floor-divided by `scale`.

    With `get_attr`, the named attribute of `v` is converted instead.
    Returns `default` when the conversion fails.
    """
    value = v
    if get_attr and value is not None:
        value = getattr(value, get_attr, None)
    try:
        converted = int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return converted
2503
2504
def str_or_none(v, default=None):
    """Return `v` converted to a string, or `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
2507
2508
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned as is; in strings the characters ',', '.' and
    '+' are dropped before conversion. Other types give None. """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if not isinstance(int_str, compat_str):
        return None
    return int_or_none(re.sub(r'[,\.\+]', '', int_str))
2516
2517
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert ``v`` to a float, returning ``default`` if it is None or invalid.

    The converted value is multiplied by ``invscale`` and divided by ``scale``.
    """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2525
2526
def bool_or_none(v, default=None):
    """Return ``v`` if it is an actual bool, otherwise ``default``."""
    if isinstance(v, bool):
        return v
    return default
2529
2530
def strip_or_none(v, default=None):
    """Return ``v`` with surrounding whitespace removed, or ``default`` if it is not a string."""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2533
2534
def url_or_none(url):
    """Return the stripped ``url`` if it looks like a supported URL, else None.

    Accepted are protocol-relative URLs ("//...") and URLs using one of the
    http(s), rtmp*/rtmfp, rtsp*, mms or ftp(s) schemes.
    """
    if not (url and isinstance(url, compat_str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2540
2541
def strftime_or_none(timestamp, date_format, default=None):
    """Format ``timestamp`` with ``date_format``, or return ``default`` on failure.

    ``timestamp`` may be a number (unix timestamp, interpreted as UTC) or a
    string in YYYYMMDD form. Any other type, an unparsable string or a
    timestamp outside the range supported by the platform gives ``default``.
    """
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For unsupported types datetime_object is still None here, so this
        # raises AttributeError, which is handled below
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # utcfromtimestamp() raises OverflowError/OSError, not only ValueError,
        # when the timestamp is out of range for the platform
        return default
2552
2553
def parse_duration(s):
    """Parse a textual duration and return it as a number of seconds.

    Three notations are tried in turn:
      1. clock style, "[[[DD:]HH:]MM:]SS[.ms]"
      2. unit style, e.g. "3h 11m 53s" or ISO 8601-like "PT1H0.040S"
         (years, months and weeks are matched but do not contribute)
      3. a single, possibly fractional, amount of hours or minutes,
         e.g. "2.5 hours" or "87 Min."

    Returns None if ``s`` is not a string, is blank or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if units:
            # The only capturing groups are days, hours, mins, secs, ms - in that order
            days, hours, mins, secs, ms = units.groups()
        else:
            loose = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not loose:
                return None
            hours, mins = loose.groups()

    total = 0
    # Each captured text is paired with the successive multipliers that turn
    # its unit into seconds
    for text, factors in ((secs, ()), (mins, (60,)), (hours, (60, 60)), (days, (24, 60, 60))):
        if not text:
            continue
        amount = float(text)
        for factor in factors:
            amount *= factor
        total += amount
    if ms:
        # The fraction may have been written with ':' in place of '.'
        total += float(ms.replace(':', '.'))
    return total
2616
2617
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ``ext`` in front of the real extension of ``filename``.

    If ``expected_real_ext`` is given and the real extension differs from it,
    ``ext`` is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
2624
2625
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of ``filename`` with ``ext``.

    If ``expected_real_ext`` is given and the real extension differs from it,
    nothing is removed and ``ext`` is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
2631
2632
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if starting the process raises OSError. """
    command = [exe] + args
    try:
        Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
    except OSError:
        return False
    else:
        return exe
2641
2642
def _get_exe_version_output(exe, args):
    """Run ``exe`` with ``args`` and return its combined stdout/stderr output.

    Returns False if the process could not be started (OSError).
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        process = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate_or_kill()
    except OSError:
        return False
    if not isinstance(output, bytes):
        return output
    # Python 2.x
    return output.decode('ascii', 'ignore')
2656
2657
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    ``version_re`` must contain one capturing group; when omitted, the text
    following the word "version" is used. Returns ``unrecognized`` if the
    pattern is not found in ``output``.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2667
2668
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    False is also returned when the executable produced no output. """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2675
2676
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in a
    cache, so the iterable is consumed at most once. Negative indices,
    open-ended slices, len(), repr() and reversed iteration all need the
    whole iterable, so they must not be used with infinite iterables.'''

    class IndexError(IndexError):
        # Subclass of the builtin, raised by __getitem__ for out-of-range indices
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        '''
        @param iterable   Source of the items
        @param reverse    Whether the list presents the items in reverse order
        @param _cache     List used to store the items already read. It is
                          used as-is (not copied), so it may be shared
        '''
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # First replay what was already read, then continue reading,
        # caching every new item before handing it out
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        ''' Read all remaining items into the cache and return the cache itself '''
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable

        Returns all items as a new list, in the order this LazyList presents them '''
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        ''' Map an index of the reversed view to the matching cache index (None stays None) '''
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        ''' Return the item (int index) or list of items (slice) at idx

        Raises LazyList.IndexError for an out-of-range integer index and
        TypeError for any other index type '''
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            # step 0 marks a plain index: neither of the step checks below
            # applies, so only a negative index forces full evaluation
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed for the largest requested
        # position to be present in the cache
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Non-empty iff a first source item exists. In the reversed view that
        # item is at index -1, which maps back to cache index 0, so at most
        # one item is read either way
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known once everything has been read
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # The new object shares this one's cache list and underlying iterator
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # Shallow copy: shares this one's cache list and underlying iterator
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2765
2766
class PagedList:
    """Base class for lists whose entries are produced one page at a time.

    ``pagefunc(pagenum)`` must return an iterable with the entries of the
    given page. Subclasses implement ``_getslice(start, end)``.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc, self._pagesize = pagefunc, pagesize
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page ``pagenum`` as a list, from the cache if present."""
        cached = self._cache.get(pagenum)
        page_results = list(self._pagefunc(pagenum)) if cached is None else cached
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2804
2805
class OnDemandPagedList(PagedList):
    """PagedList that requests pages one after another until a short page is seen."""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            page_start = pagenum * self._pagesize
            page_stop = pagenum * self._pagesize + self._pagesize
            if start >= page_stop:
                continue

            # Offsets inside this page at which the requested range begins/ends
            skip = start % self._pagesize if page_start <= start < page_stop else 0
            if end is not None and page_start <= end <= page_stop:
                cut = ((end - 1) % self._pagesize) + 1
            else:
                cut = None

            page_results = self.getpage(pagenum)
            if skip != 0 or cut is not None:
                page_results = page_results[skip:cut]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_is_short = len(page_results) + skip < self._pagesize

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if page_is_short or end == page_stop:
                break
2839
2840
class InAdvancePagedList(PagedList):
    """PagedList for sources whose number of pages is known in advance.

    Pages are always cached.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Never request pages past the known page count, even when `end`
        # lies beyond the last available entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the first page, and entries still to be yielded
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page completes the requested range
                    yield from page_results[:only_more]
                    break
            yield from page_results
2864
2865
def uppercase_escape(s):
    r"""Replace every ``\UXXXXXXXX`` escape sequence in ``s`` with its character."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2872
2873
def lowercase_escape(s):
    r"""Replace every ``\uXXXX`` escape sequence in ``s`` with its character."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2880
2881
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    Characters with a reserved meaning in URLs and existing "%" escapes are
    left untouched.
    """
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2887
2888
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host is IDNA-encoded; path, params, query and fragment are each
    escaped with escape_rfc3986().
    """
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped.geturl()
2899
2900
def parse_qs(url):
    """Return the query parameters of ``url`` as a dict mapping names to lists of values."""
    query_string = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query_string)
2903
2904
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object and close it.

    Lines may be bytes or text. Blank lines and lines starting with '#', ';'
    or ']' are skipped, and a trailing comment introduced by whitespace
    followed by '#' is removed.
    """
    def clean_line(line):
        text = line if isinstance(line, compat_str) else line.decode('utf-8', 'replace')
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if text.startswith(bom):
                text = text[len(bom):]
        text = text.lstrip()
        if not text or text.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', text, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        cleaned = (clean_line(line) for line in fd)
        return [url for url in cleaned if url]
2922
2923
def urlencode_postdata(*args, **kargs):
    """URL-encode the given form data and return it as ASCII bytes.

    All arguments are passed on to compat_urllib_parse_urlencode().
    """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2926
2927
def update_url_query(url, query):
    """Return ``url`` with the parameters from the ``query`` dict set in its query string.

    Parameters already present in ``url`` are replaced by those in ``query``.
    An empty ``query`` returns ``url`` unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2936
2937
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of the request ``req`` with the given parts replaced.

    ``headers`` are merged over the existing ones and ``query`` is merged
    into the URL's query string. HEAD and PUT requests keep their method;
    a ``timeout`` attribute, if present, is carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, compat_urllib_request.Request)
    new_req = request_class(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2956
2957
2958 def _multipart_encode_impl(data, boundary):
2959     content_type = 'multipart/form-data; boundary=%s' % boundary
2960
2961     out = b''
2962     for k, v in data.items():
2963         out += b'--' + boundary.encode('ascii') + b'\r\n'
2964         if isinstance(k, compat_str):
2965             k = k.encode('utf-8')
2966         if isinstance(v, compat_str):
2967             v = v.encode('utf-8')
2968         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2969         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2970         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2971         if boundary.encode('ascii') in content:
2972             raise ValueError('Boundary overlaps with data')
2973         out += content
2974
2975     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2976
2977     return out, content_type
2978
2979
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded body, Content-Type header value). Raises
    ValueError if a specified boundary occurs inside the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary is used as-is; a clash is the caller's error
        return _multipart_encode_impl(data, boundary)

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
3008
3009
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one of several keys, in ``d``.

    With a list or tuple of keys, the value of the first key that is present
    and not None (and, if ``skip_false_values``, not falsy) is returned.
    A single key is a plain ``d.get()``. ``default`` is returned otherwise.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
3018
3019
def try_get(src, getter, expected_type=None):
    """Apply one or several getter callables to ``src`` and return the first usable result.

    A getter that raises AttributeError, KeyError, TypeError or IndexError is
    skipped, as is a result that is not an instance of ``expected_type``
    (when given). Returns None if no getter produces a usable value.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
3029
3030
def merge_dicts(*dicts):
    """Merge several dicts, giving precedence to earlier ones.

    None values are ignored. A value already merged is only replaced when it
    is an empty string and the new value is a non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key in merged:
                current = merged[key]
                fills_blank = (
                    isinstance(value, compat_str) and value
                    and isinstance(current, compat_str) and not current)
                if not fills_blank:
                    continue
            merged[key] = value
    return merged
3043
3044
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return ``string`` as text, decoding it with ``encoding`` if it is not text already.

    Note that the default ``encoding`` is evaluated once, when the function is defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
3047
3048
# Age limit associated with each US rating label.
# parse_age_limit() looks labels up here after upper-casing them
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit associated with each TV parental guideline label.
# Every key must start with "TV-": parse_age_limit() builds its pattern
# from the part of each key that follows this prefix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3066
3067
def parse_age_limit(s):
    """Convert an age rating to an age limit in years, or None if not recognised.

    Accepted are ints from 0 to 21, strings such as "18" or "18+", the labels
    in US_RATINGS and the labels in TV_PARENTAL_GUIDELINES (the separator
    after "TV" may be "-", "_" or absent). Labels are matched case-insensitively.
    """
    # The exact type is checked rather than isinstance(), so bool values are
    # not treated as ages
    if type(s) is int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    label = s.upper()
    if label in US_RATINGS:
        return US_RATINGS[label]
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, label)
    if tv:
        return TV_PARENTAL_GUIDELINES['TV-' + tv.group(1)]
    return None
3083
3084
def strip_jsonp(code):
    """Remove a JSONP wrapper such as ``callback(...);`` and return the payload.

    An optional ``window.`` prefix, a ``name && name(...)`` guard and trailing
    ``//`` comments are handled. Text without such a wrapper is returned unchanged.
    """
    jsonp_wrapper = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_wrapper, r'\g<callback_data>', code)
3093
3094
def js_to_json(code, vars={}):
    """Rewrite JavaScript object-literal source so that it is closer to valid JSON.

    The source is tokenized by one regular expression; every matched token
    (string, comment, trailing comma, identifier, hex/octal number, numeric
    key, run of "!") is rewritten by fix_kv(), everything else is kept as is.

    code: the JavaScript source text
    vars: a dict of var, val pairs to substitute; an identifier found in it
          is replaced by its value verbatim. Only read, never modified
    """
    # vars is a dict of var, val pairs to substitute
    # Block comments (/* ... */) and line comments (// ... up to the newline)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace with at most one comment in it
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for hexadecimal and octal literals,
    # optionally followed by ":" when used as an object key
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Return the JSON replacement for one matched token
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of "!" and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Quoted string: adjust the escapes of its content for a
            # double-quoted JSON string (quotes are added back at the end)
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    # Hex/octal literal: emit it in decimal, quoted if it is a key
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        # Strings, and anything else left (identifiers, decimal keys), get double-quoted
        return '"%s"' % v

    # Alternatives, in order: double-quoted string, single-quoted string,
    # comment, comma directly before a closing bracket/brace, "void 0",
    # identifier, hex/octal number (optionally a key), decimal key, run of "!"
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3141
3142
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in ``quality_ids``,
    or to -1 if it is not listed. """
    def rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
3151
3152
# NOTE(review): presumably the stages at which a postprocessor can be run -
# the consumers of this set are not visible here, confirm before relying on it
POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}


# Output filename templates, in %-style mapping format, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types. The value is either None or a string
# NOTE(review): the string looks like the default filename suffix for files of
# that type - confirm against the code that reads this table
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template for a verbose regex matching one such %-format specifier that is
# not itself escaped: {0} is the placeholder for the pattern of the mapping
# key and {1} for the pattern of the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3192
3193
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than ``length`` are cut so that the result, including the
    trailing '...', is ``length`` characters long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
3202
3203
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints.

    Raises ValueError if any part is not an integer.
    """
    return tuple(map(int, re.split(r'[-.]', v)))
3206
3207
def is_outdated_version(version, limit, assume_new=True):
    """Return whether ``version`` is older than ``limit``.

    If ``version`` is empty, or either version cannot be parsed, the answer
    is the opposite of ``assume_new``.
    """
    unknown_result = not assume_new
    if not version:
        return unknown_result
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown_result
3215
3216
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U

    This is the negation of what update.is_non_updateable() returns. """

    # NOTE(review): imported here rather than at module level, presumably to
    # avoid a circular import - confirm
    from .update import is_non_updateable

    return not is_non_updateable()
3223
3224
def args_to_str(args):
    """Return the command ``args`` as one string, each argument shell-quoted."""
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
3228
3229
def error_to_compat_str(err):
    """Return the message of the exception ``err`` as text."""
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
3237
3238
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters of the MIME type (anything after ';') are ignored and the
    lookup is case-insensitive. The full type is tried first, then the
    subtype, then the structured-syntax suffix after '+'. If nothing is
    known, the subtype itself is returned with '+' replaced by '.'.

    Returns None if ``mt`` is None.
    """
    if mt is None:
        return None

    # Type and subtype names are case-insensitive, so normalise once and use
    # the same form for every lookup below
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3301
3302
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension, filename or URL.

    A value without a '.' is taken to be a bare extension. Returns None for
    empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(name)
    return mime_type
3309
3310
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video, audio and text codecs.

    Returns a dict with 'vcodec', 'acodec' ('none' if absent) and
    'dynamic_range', plus 'tcodec' if a text codec was found. Only the first
    codec of each kind is used. If no codec is recognised but exactly two
    were given, they are taken as video and audio codec in that order;
    otherwise an empty dict is returned.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        entry for entry in map(str.strip, codecs_str.strip().strip(',').split(','))
        if entry]

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    TEXT_CODECS = ('stpp', 'wvtt',)

    vcodec = acodec = tcodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. "av01" and "vp09" are recognised
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                continue
            # For these codecs only the first four fields are kept
            vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif codec in TEXT_CODECS:
            tcodec = tcodec or full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if not (vcodec or acodec or tcodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if tcodec is not None:
        result['tcodec'] = tcodec
    return result
3352
3353
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response from its headers.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        named = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(named.group('filename'), default_ext=None) if named else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3366
3367
def encode_data_uri(data, mime_type):
    """Return the bytes ``data`` as a base64 ``data:`` URI of the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3370
3371
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set or when the content has no
    limit (i.e. it is available for everyone). """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3380
3381
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 if there
    is none). The result is truthy (a match object) if the text starts with
    "<", optionally preceded by whitespace, and None otherwise. """

    # The UTF-32 marks come before the UTF-16 ones, whose bytes they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    bom, enc = next(
        ((bom, enc) for bom, enc in BOMS if first_bytes.startswith(bom)),
        (b'', 'utf-8'))
    text = first_bytes[len(bom):].decode(enc, 'replace')

    return re.match(r'^\s*<', text)
3400
3401
def determine_protocol(info_dict):
    """Return the protocol of a format described by ``info_dict``.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the 'url' entry: its rtmp/mms/rtsp prefix, then its m3u8/f4m
    extension, and finally its URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
3422
3423
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    r""" Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row  Sequence of column titles
    @param data        List of rows; each row is a sequence of cell values
                       (converted with str())
    @param delim       If truthy, a string repeated to draw a separator line
                       below the header
    @param extra_gap   Additional spaces between columns (one is always added)
    @param hide_empty  Drop the columns whose cells have zero width in every
                       data row
    @returns           The table as a single string, rows joined by newlines

    The rows passed in are never modified.
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for (take, col) in zip(filterArray, row) if take]

    # Without data rows there is nothing to judge emptiness by, so the header
    # is kept instead of being filtered down to nothing
    if hide_empty and data:
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        if delim_row:  # a table without columns has no delimiter to trim
            delim_row[-1] = delim_row[-1][:-extra_gap]  # Remove extra_gap from end of delimiter
        table = [header_row, delim_row] + data

    # Build fresh strings rather than assigning into the rows, so that the
    # caller's lists stay untouched and immutable rows (tuples) are accepted
    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - width(text)
            if '\t' in text:
                # The tab marks where the padding goes: what follows it ends up right aligned
                cells.append(text.replace('\t', ' ' * padding) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (padding + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3455
3456
3457 def _match_one(filter_part, dct, incomplete):
3458     # TODO: Generalize code with YoutubeDL._build_format_filter
3459     STRING_OPERATORS = {
3460         '*=': operator.contains,
3461         '^=': lambda attr, value: attr.startswith(value),
3462         '$=': lambda attr, value: attr.endswith(value),
3463         '~=': lambda attr, value: re.search(value, attr),
3464     }
3465     COMPARISON_OPERATORS = {
3466         **STRING_OPERATORS,
3467         '<=': operator.le,  # "<=" must be defined above "<"
3468         '<': operator.lt,
3469         '>=': operator.ge,
3470         '>': operator.gt,
3471         '=': operator.eq,
3472     }
3473
3474     operator_rex = re.compile(r'''(?x)\s*
3475         (?P<key>[a-z_]+)
3476         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3477         (?:
3478             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3479             (?P<strval>.+?)
3480         )
3481         \s*$
3482         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3483     m = operator_rex.search(filter_part)
3484     if m:
3485         m = m.groupdict()
3486         unnegated_op = COMPARISON_OPERATORS[m['op']]
3487         if m['negation']:
3488             op = lambda attr, value: not unnegated_op(attr, value)
3489         else:
3490             op = unnegated_op
3491         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3492         if m['quote']:
3493             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3494         actual_value = dct.get(m['key'])
3495         numeric_comparison = None
3496         if isinstance(actual_value, compat_numeric_types):
3497             # If the original field is a string and matching comparisonvalue is
3498             # a number we should respect the origin of the original field
3499             # and process comparison value as a string (see
3500             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3501             try:
3502                 numeric_comparison = int(comparison_value)
3503             except ValueError:
3504                 numeric_comparison = parse_filesize(comparison_value)
3505                 if numeric_comparison is None:
3506                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3507                 if numeric_comparison is None:
3508                     numeric_comparison = parse_duration(comparison_value)
3509         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3510             raise ValueError('Operator %s only supports string values!' % m['op'])
3511         if actual_value is None:
3512             return incomplete or m['none_inclusive']
3513         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3514
3515     UNARY_OPERATORS = {
3516         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3517         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3518     }
3519     operator_rex = re.compile(r'''(?x)\s*
3520         (?P<op>%s)\s*(?P<key>[a-z_]+)
3521         \s*$
3522         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3523     m = operator_rex.search(filter_part)
3524     if m:
3525         op = UNARY_OPERATORS[m.group('op')]
3526         actual_value = dct.get(m.group('key'))
3527         if incomplete and actual_value is None:
3528             return True
3529         return op(actual_value)
3530
3531     raise ValueError('Invalid filter part %r' % filter_part)
3532
3533
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False

    filter_str is a list of conditions separated by "&" (write "\\&" for a
    literal ampersand); all of them have to pass.
    When incomplete, conditions on missing fields pass
    """
    for part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3541
3542
def match_filter_func(filter_str):
    """Build a filter callback from the match_str() expression filter_str.

    The returned function takes an info dict (any further arguments are passed
    on to match_str) and returns None if it passes the filter, otherwise a
    message explaining why it is skipped.
    """
    def _match_func(info_dict, *args, **kwargs):
        if not match_str(filter_str, info_dict, *args, **kwargs):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
3551
3552
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression to seconds (float).

    Accepted forms are a plain offset in seconds with an optional "s" suffix
    ("12.5", "12.5s") and a clock value "H:MM:SS", optionally followed by
    ".fraction" or ":fraction". Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock is None:
        return None
    hours, minutes, seconds = clock.groups()
    # A ":" before the trailing digits is read like a decimal point
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3564
3565
def srt_subtitles_timecode(seconds):
    """Format a time given in seconds as an SRT timecode ("00:00:00,000" layout)."""
    # timetuple_from_msec() has to supply exactly the four fields of the format
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3568
3569
def ass_subtitles_timecode(seconds):
    """Format a time given in seconds as an ASS timecode ("0:00:00.00" layout)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    leading_fields = timetuple[:-1]
    # The last field is in hundredths of a second: milliseconds / 10, truncated by %d
    return '%01d:%02d:%02d.%02d' % (*leading_fields, timetuple.milliseconds / 10)
3573
3574
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Every <p> element with a usable "begin" and either "end" or "dur" becomes
    one SRT cue. The styling attributes listed in SUPPORTED_STYLING are
    rendered as <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # (current namespace, [legacy namespaces rewritten to it before parsing])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # tts:* attributes that are taken into account
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, with inheritance already applied
    default_style = {}  # style of <body>/<div>; applied to every element

    class TTMLPElementParser(object):
        # Parser target that turns one <p> element into SRT-flavoured text
        _out = ''
        # NOTE: these two lists live on the class, so they are shared by all
        # the parsers created during one dfxp2srt() call
        _unclosed_elements = []  # per open element: the tags written for it
        _applied_styles = []  # stack of the styles in effect

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style, lowest to highest precedence: default style,
                # referenced style, inline tts:* attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip what the style on top of the stack already provides
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags written for this element, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the element and feed it through TTMLPElementParser;
        # the parser's close() hands back the value of the target's close()
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Collect the <style> definitions. A style may inherit from another one
    # that is defined later in the document, so the pass is repeated until
    # every parent has been resolved.
    ignore_missing_parents = False
    while True:
        repeat = False
        resolved_before = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not ignore_missing_parents:
                    repeat = True
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not repeat:
            break
        if len(styles) == resolved_before:
            # A whole pass resolved nothing new, so the remaining parents can
            # never appear (undefined, cyclic, or without any supported
            # property). Do one last pass that treats them as contributing
            # nothing instead of looping forever.
            ignore_missing_parents = True

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Paragraphs without usable timing are skipped (their index is not reused)
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3737
3738
def cli_option(params, command_option, param, separator=None):
    """Build the command line arguments for an option that takes a value.

    @param params          Dictionary of parameters
    @param command_option  The option to emit, e.g. '--proxy'
    @param param           Key in params holding the value of the option
    @param separator       If given, option and value are joined with it into
                           a single argument instead of being passed as two
    @returns               A list of string arguments; empty if the parameter
                           is missing or None
    """
    value = params.get(param)
    if value is None:
        return []
    # Stringify every value that is set, including falsy ones such as 0;
    # a command line may only contain strings
    value = str(value)
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
3744
3745
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for an option that takes a boolean value.

    Returns an empty list if the parameter is unset (None). Otherwise the
    parameter must be a bool; it is rendered as true_value or false_value and
    passed either as a separate argument or, if separator is given, joined to
    the option with it.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if not separator:
        return [command_option, value]
    return [command_option + separator + value]
3754
3755
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3759
3760
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select the extra command line arguments configured for one of keys.

    @param argdict     Dictionary mapping (lowercase) keys to argument lists.
                       For backward compatibility a plain list/tuple is also
                       accepted; it is returned as it is if use_compat is
                       set and ignored otherwise
    @param keys        List/tuple of candidates in order of preference. Each
                       candidate is a key or a group of keys; for a group,
                       the arguments of all its keys that are set are
                       concatenated
    @param default     Returned when nothing is configured for any candidate
    @param use_compat  Whether a list/tuple argdict may be used
    @returns           The list of arguments of the first candidate that has
                       any, else default (list defaults are copied)
    """
    def fallback():
        # Hand out a copy of a list default. Otherwise the object shared
        # through the signature's default=[] would be returned, and a caller
        # modifying the result would change what later calls get back.
        return list(default) if isinstance(default, list) else default

    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return fallback()
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_list = [args for args in candidates if args is not None]
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return fallback()
3779
3780
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the arguments configured for exe when used on behalf of main_key.

    The root key is "<exe>" if main_key and exe are the same (ignoring case),
    else "<main_key>+<exe>". Each entry of keys is a suffix appended to the
    root key. If the bare root key is among the candidates, the group
    (main_key, exe) - when the two differ - and 'default' are tried as well;
    if it is not, the list/tuple compatibility mode of cli_configuration_args
    is turned off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'

    suffixes = keys or ['']
    keys = [f'{root_key}{suffix}' for suffix in suffixes]
    if root_key not in keys:
        use_compat = False
    else:
        if not same_name:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3792
3793
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Two-letter code -> three-letter code. The withdrawn two-letter codes
    # come after their replacements, so that a reverse lookup finds the
    # current code first.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at.
        Returns None for unknown codes.
        """
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        The first matching entry of the table is used.
        Returns None for unknown codes.
        """
        matches = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3997
3998
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    # Two-letter (uppercase) country code -> English name of the country
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes.
        """
        country_code = code.upper()
        return cls._country_map.get(country_code)
4257
4258
class GeoUtils(object):
    """Helpers for picking IPv4 addresses that belong to a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address from a country or from a CIDR block

        @param code_or_block    Either a two-letter country code (looked up
                                case-insensitively in _country_ip_map) or an
                                IPv4 block in CIDR notation ("a.b.c.d/len")
        @returns                The address in dotted-quad notation, or None
                                when the country code has no known block
        """
        block = code_or_block
        if len(code_or_block) == 2:
            # Anything of length two is treated as a country code
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_length = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit yields the last address of the block
        highest = lowest | (0xffffffff >> int(prefix_length))
        chosen = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
4517
4518
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request choose its own proxy

    A request can carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request only. The value '__noproxy__' disables proxying.
    """

    def __init__(self, proxies=None):
        # Set default handlers for both schemes before the base class
        # registers its own handlers for the given proxies
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, scheme + '_open', default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Apply the proxy for this request, honouring a per-request override

        Returns None when no proxy is to be used or when a SOCKS proxy was
        recorded on the request in the 'Ytdl-socks-proxy' header. Otherwise
        defers to ProxyHandler.proxy_open.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The override header is internal and is removed from the request
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with socks themselves
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(self, req, proxy, type)
4542
4543
4544 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4545 # released into Public Domain
4546 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4547
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result carries no leading zero bytes, except that values that are
    not positive yield a single zero byte. If optional blocksize is given and
    greater than zero, the front of the byte string is padded with binary
    zeros so that the length is a multiple of blocksize.
    """
    n = int(n)
    words = []
    while n > 0:
        # Collect 32-bit words starting from the least significant one
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    # Most significant word first, without the zero padding of that word
    packed = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    remainder = len(packed) % blocksize if blocksize > 0 else 0
    if remainder:
        packed = b'\000' * (blocksize - remainder) + packed
    return packed
4576
4577
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zeros so that the data splits into whole 32-bit words
    padding = -len(s) % 4
    if padding:
        s = b'\000' * padding + s
    result = 0
    for offset in range(0, len(s), 4):
        word, = compat_struct_unpack('>I', s[offset:offset + 4])
        result = (result << 32) + word
    return result
4593
4594
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The data is interpreted as a little-endian number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4610
4611
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the padding
    consists of at least 8 random non-zero octets.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for the padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding must not contain zero octets: the first zero octet after
    # the [0, 2] header marks where the padding ends and the data begins
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4625
4626
def encode_base_n(num, n, table=None):
    """Convert an integer to its string representation in base n

    @param num      The integer to encode. A negative number is encoded as
                    its absolute value prefixed with '-'
    @param n        The base. Must be at least 2 and must not exceed the
                    length of the table
    @param table    The digit characters in order of their value. Defaults to
                    the first n characters of 0-9, a-z, A-Z
    @returns        The encoded string
    @raises ValueError  If n is smaller than 2 or exceeds the table length
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if n < 2:
        # Dividing by a base below 2 never reduces the number
        raise ValueError('base %d is not supported' % n)
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number to zero, so encode the
    # magnitude and restore the sign afterwards
    sign = '-' if num < 0 else ''
    num = abs(num)
    digits = []
    while num:
        num, digit = divmod(num, n)
        digits.append(table[digit])
    return sign + ''.join(reversed(digits))
4643
4644
def decode_packed_codes(code):
    """Expand packed code by substituting the symbols back into it

    The packed payload, the base, the symbol count and the '|'-separated
    symbol list are extracted from code with PACKED_CODES_RE. Every word of
    the payload is then replaced through the resulting symbol table.
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    base = int(base)
    remaining = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    while remaining:
        remaining -= 1
        placeholder = encode_base_n(remaining, base)
        # An empty symbol means that the placeholder stands for itself
        symbol_table[placeholder] = symbols[remaining] or placeholder

    def _substitute(match):
        return symbol_table[match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
4661
4662
def caesar(s, alphabet, shift):
    """Shift every character of s that occurs in alphabet by shift positions

    Positions wrap around the end of the alphabet. Characters that are not
    part of the alphabet are kept unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
4670
4671
def rot47(s):
    """Apply a caesar shift of 47 over the alphabet of printable ASCII characters"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4674
4675
def parse_m3u8_attributes(attrib):
    """Parse a comma separated KEY=VALUE attribute list into a dict

    Values may be quoted, in which case they can contain commas. The quotes
    are not part of the returned value.
    """
    attributes = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        value = match.group('val')
        attributes[match.group('key')] = value[1:-1] if value.startswith('"') else value
    return attributes
4683
4684
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as an unsigned 32-bit number"""
    if val < 0:
        val += 0x100000000
    return val >> n
4687
4688
4689 # Based on png2str() written by @gdkchan and improved by @yokrysty
4690 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG image data into rows of unfiltered colour samples

    Reference: https://www.w3.org/TR/PNG/

    Every scanline is read as three 8-bit samples per pixel. The colour type
    and bit depth fields of the IHDR chunk are not inspected.

    @param png_data     The complete contents of the PNG file as bytes
    @returns            A tuple (width, height, pixels), where pixels is a
                        list of height rows and each row is a flat list of
                        width * 3 integers
    @raises IOError     If the PNG signature or the leading IHDR chunk is
                        missing, or if the file contains no IDAT data
    """
    signature, body = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or body[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats = {1: '>B', 2: '>H', 4: '>I'}

    def _read_uint(raw):
        return compat_struct_unpack(formats[len(raw)], raw)[0]

    # Each chunk consists of a length, a type, the data and a CRC
    chunks = []
    remaining = body
    while remaining:
        size = _read_uint(remaining[:4])
        chunk_type = remaining[4:8]
        chunk_data = remaining[8:8 + size]
        remaining = remaining[8 + size + 4:]  # Also skips the CRC
        chunks.append((chunk_type, chunk_data))

    ihdr = chunks[0][1]
    width = _read_uint(ihdr[:4])
    height = _read_uint(ihdr[4:8])

    idat = b''
    for chunk_type, chunk_data in chunks:
        if chunk_type == b'IDAT':
            idat += chunk_data

    if not idat:
        raise IOError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline starts with one byte naming its filter type
        line_start = y * (1 + stride)
        filter_type = scanlines[line_start]

        row = []
        pixels.append(row)

        for x in range(stride):
            color = scanlines[line_start + 1 + x]
            # Neighbours are the same sample of the previous pixel (three
            # bytes back) and of the previous row. They count as 0 at the edges
            left = row[x - 3] if x > 2 else 0
            up = pixels[y - 1][x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = pixels[y - 1][x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                distance_left = abs(estimate - left)
                distance_up = abs(estimate - up)
                distance_upper_left = abs(estimate - upper_left)

                # The neighbour closest to the estimate is the predictor
                if distance_left <= distance_up and distance_left <= distance_upper_left:
                    predictor = left
                elif distance_up <= distance_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                color = (color + predictor) & 0xff

            row.append(color)

    return width, height, pixels
4794
4795
def write_xattr(path, key, value):
    """Set the extended attribute key of the file at path to value

    The `xattr` Python module is tried first, where both the pyxattr and the
    xattr flavour are handled. If it cannot be imported, the value is written
    to an NTFS Alternate Data Stream on Windows, or through the `setfattr` or
    `xattr` command line tools on other systems.

    @param path     Path of the file to set the attribute on
    @param key      Name of the attribute
    @param value    Value of the attribute as bytes. It is written as-is by
                    the ADS fallback and decoded as UTF-8 for the
                    command line tools
    @raises XAttrUnavailableError   If pyxattr is too old, or if neither a
                                    module nor a command line tool is found
    @raises XAttrMetadataError      If setting the attribute fails
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE: despite the wording of the message, no other
                # implementation is tried here; the error is raised
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    # NOTE: this handler covers the whole block above, not only the import
    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # A stream is addressed as "<file>:<stream name>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # presumably truthy when the tool can be executed - TODO confirm
            # against check_executable
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The command line tools take the value as a text argument
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A failing tool is reported with its exit status and stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
4878
4879
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 as form fields

    The result maps the three given field names to the year, month and day
    of the date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, date_parts)))
4890
4891
# Templates for internet shortcut files, which are plain text files.
# They are filled in with the %-operator and a dict providing 'url'
# (and 'filename' for the .desktop template).

# Windows .url shortcut
DOT_URL_LINK_TEMPLATE = (
    '[InternetShortcut]\n'
    'URL=%(url)s\n'
)

# macOS .webloc shortcut (a property list)
DOT_WEBLOC_LINK_TEMPLATE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
    '<plist version="1.0">\n'
    '<dict>\n'
    '\t<key>URL</key>\n'
    '\t<string>%(url)s</string>\n'
    '</dict>\n'
    '</plist>\n'
)

# freedesktop.org .desktop entry of type "Link"
DOT_DESKTOP_LINK_TEMPLATE = (
    '[Desktop Entry]\n'
    'Encoding=UTF-8\n'
    'Name=%(filename)s\n'
    'Type=Link\n'
    'URL=%(url)s\n'
    'Icon=text-html\n'
)

# Maps the shortcut file extension to its template
LINK_TEMPLATES = dict(
    url=DOT_URL_LINK_TEMPLATE,
    desktop=DOT_DESKTOP_LINK_TEMPLATE,
    webloc=DOT_WEBLOC_LINK_TEMPLATE,
)
4923
4924
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Unicode hostnames are converted to Punycode. An explicit port is kept, except for port 80 on `http` URLs, where it is the default and is omitted. IRIs without a hostname (e.g. relative references) are supported.

    Raises ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    # A netloc with unbalanced brackets already makes the parsing above raise a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    hostname = iri_parts.hostname
    if hostname is not None:  # There is no hostname in e.g. relative references
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += hostname.encode('idna').decode('utf-8')

    port = iri_parts.port
    # Port 80 may only be omitted where it is the default port. Dropping it from e.g. an `https` URL would change the port that is addressed.
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4965
4966     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4967
4968
def to_high_limit_path(path):
    """Return path in a form that is not subject to the MAX_PATH limit on Windows

    On other platforms the path is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    # A raw string cannot end in a backslash, hence the trailing space that is stripped again
    extended_length_prefix = r'\\?\ '.rstrip()
    return extended_length_prefix + os.path.abspath(path)
4975
4976
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value with template, or return default if the value is to be ignored

    @param obj          The value itself or, when field is given, a dict-like
                        object to get the value from
    @param field        Key of the value in obj
    @param template     %-style template that the value is applied to
    @param ignore       Values for which default is returned instead
    @param default      Result for ignored values. It also stands in for a
                        missing field or for an obj of None
    @param func         Function to apply to the value before formatting.
                        It is not called for ignored values
    """
    if field is not None:
        value = obj.get(field, default)
    elif obj is None:
        value = default
    else:
        value = obj

    if func and value not in ignore:
        value = func(value)
    if value in ignore:
        return default
    return template % value
4985
4986
def clean_podcast_url(url):
    """Remove the known podcast analytics prefixes from a media URL

    Every occurrence of such a prefix is removed, which leaves the URL of
    the service that the prefix redirects to.
    """
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
5002
5003
5004 _HEX_TABLE = '0123456789abcdef'
5005
5006
5007 def random_uuidv4():
5008     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5009
5010
def make_dir(path, to_screen=None):
    """Create the directory that path is located in, if it does not exist

    @param path         Path of a file. Only its directory part is created
    @param to_screen    Optional callable that is passed a message if the
                        directory cannot be created
    @returns            True if the directory exists afterwards or if path
                        has no directory part, False if creating it failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # to_screen is optional. callable() returns a bool, so it must not be compared against None
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5021
5022
def get_executable_path():
    """Return the absolute path of the directory that the program is run from

    This is the directory of the executable for frozen builds, and otherwise
    a directory above the one that contains this module.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        location = os.path.dirname(sys.executable)
    else:
        running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
        # Inside a ZIP the module is nested one level deeper
        parent = '../..' if running_from_zip else '..'
        location = os.path.join(os.path.dirname(__file__), parent)
    return os.path.abspath(location)
5032
5033
def load_plugins(name, suffix, namespace):
    """Load the plugin classes of the ytdlp_plugins/<name> package

    The package is looked up next to the program (see get_executable_path)
    and registered in sys.modules under name. Each of its attributes whose
    name ends with suffix and is not yet present in namespace is added to
    namespace.

    @param name         Name of the plugin package
    @param suffix       Suffix of the attribute names to load
    @param namespace    Dict that the loaded attributes are added to
    @returns            Dict of the newly loaded attributes by name. It is
                        empty if the plugin package does not exist
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for attr_name in dir(plugins):
            if attr_name in namespace or not attr_name.endswith(suffix):
                continue
            classes[attr_name] = namespace[attr_name] = getattr(plugins, attr_name)
    except FileNotFoundError:
        # There are no plugins of this kind
        pass
    return classes
5052
5053
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The value found by the first path that yields a result.
                            For a path that branches (tuple, function or "..."), this is
                            the list of all results, or the first of them if get_all is False
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # depth records the deepest level of branching that was reached, so that
        # the nested result lists can be flattened afterwards
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse each of the alternative keys and branch over the results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # Like for lists, the function is given the index of each character.
                    # The string itself cannot be unpacked into (key, value) pairs
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ":" selects everything, just like "..."
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Each level of branching nested the results in one more list
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5151
5152
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated alias of traverse_obj.

    Writes a deprecation notice through write_string and then forwards
    `dictn`, `keys` and `casesense` to traverse_obj, always passing
    is_user_input=True and traverse_string=True.
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5157
5158
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is a "real" iterable, else wrap it in a 1-tuple.

    Instances of `allowed_types` are treated as single values even though
    they are iterable, so they get wrapped as well.
    """
    if isinstance(x, allowed_types) or not isinstance(x, collections.abc.Iterable):
        return (x,)
    return x
5161
5162
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Build an HMAC-SHA256 signed token of the form header.payload.signature.

    @param payload_data  JSON-serializable object used as the token payload
    @param key           Signing secret as str; it is UTF-8 encoded before use
    @param headers       Optional dict merged over the default header
                         {'alg': 'HS256', 'typ': 'JWT'}
    @returns             The token as bytes
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    # NOTE(review): the segments are standard, padded base64 (b64encode), whereas
    # RFC 7515 specifies unpadded base64url. Kept as-is so that the generated
    # tokens do not change - confirm with the consumers before switching
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return signing_input + b'.' + signature_b64
5180
5181
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a three-segment token.

    The header and signature segments are ignored; in particular the
    signature is NOT verified.

    @param jwt  Token as str, in the form header.payload.signature
    @returns    The payload parsed from JSON
    @raises     ValueError if the token does not have exactly three segments
    """
    _, payload_b64, _ = jwt.split('.')
    # Tokens normally have the trailing "=" padding stripped, which the decoder
    # rejects, so restore it. Already padded segments are left untouched
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5187
5188
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences may be written to `stream`.

    Returns False when
        - on Windows ('nt'): WINDOWS_VT_MODE is falsy or the Windows version
          is older than (10, 0, 10586)
        - elsewhere: the TERM environment variable is unset or empty
        - stream.isatty() raises anything at all
    Otherwise the result of stream.isatty() is returned.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        vt_usable = WINDOWS_VT_MODE and get_windows_version() >= (10, 0, 10586)
        if not vt_usable:
            return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
5200
5201
5202 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5203
5204
5205 def remove_terminal_sequences(string):
5206     return _terminal_sequences_re.sub('', string)
5207
5208
def number_of_digits(number):
    """Return the length of `number` formatted with '%d'.

    Because the formatted text is measured, the sign of a negative number
    counts towards the result.
    """
    formatted = '%d' % number
    return len(formatted)
5211
5212
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the str() of all truthy `values` with `delim`.

    If `from_dict` is given, `values` are treated as keys and are first
    replaced by from_dict.get(key); falsy and missing entries are dropped.
    """
    if from_dict is not None:
        values = [from_dict.get(key) for key in values]
    return delim.join(str(value) for value in values if value)
5217
5218
class Config:
    """A list of command-line arguments plus the configs they pull in.

    An instance holds the arguments given to it directly (own_args) and one
    child Config per entry of the parsed "config_locations" option. The set
    of already loaded paths is shared with every child (see append_config),
    so the same file is never loaded twice.
    """
    own_args = None  # arguments given directly to this config; set by init()
    filename = None  # file the arguments came from, if any; set by init()
    __initialized = False  # init() may only run once per instance

    def __init__(self, parser, label=None):
        """
        @param parser  Object providing parse_args(args) and error(msg)
        @param label   Optional name used when printing this config
        """
        self._parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Store `args` and load every config location they refer to.

        @param args      Argument list owned by this config
        @param filename  File that `args` were read from, if any
        @returns         False if `filename` was already loaded (nothing is
                         stored in that case), True otherwise
        """
        assert not self.__initialized
        if filename:
            location = os.path.realpath(filename)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            location = compat_expanduser(location)
            if os.path.isdir(location):
                # A directory stands for the default config file inside it
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Describe own_args (login info hidden) followed by all nested configs.

        The text of each nested config is prefixed with "| " on every line.
        """
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments.

        @param filename  Path of the file to read
        @param default   Value to return when the file cannot be opened
        @returns         The arguments, split shell-style with comments
                         removed. If the file cannot be opened, `default` is
                         returned instead; a list `default` is returned as a
                         copy so that modifying the result can never alter the
                         shared default argument
        """
        try:
            optionf = open(filename)
        except IOError:
            # silently skip if file is not present
            return list(default) if isinstance(default, list) else default
        with optionf:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
        return compat_shlex_split(contents, comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of login options masked.

        Both forms are handled: "--option=value" becomes "--option=PRIVATE"
        and the argument following a bare "--option" becomes "PRIVATE".
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (passed on to init) and keep it,
        unless init reports that its file was already loaded."""
        config = type(self)(self._parser, label)
        # Share the set itself (not a copy) so that duplicates are detected
        # across the whole tree of configs
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the arguments of all nested configs (the one added last comes
        first), followed by own_args."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        """Run the parser over all_args and return its result."""
        return self._parser.parse_args(list(self.all_args))