#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
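
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the namespace URL is made up). xpath_with_ns() expands each
# `prefix:tag` step using the supplied namespace map:
#
#   >>> xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}url'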


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
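
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the markup is made up). The generator yields a (text, html)
# pair for every element carrying the attribute:
#
#   >>> list(get_elements_text_and_html_by_attribute(
#   ...     'data-id', 'x', '<span data-id="x">hello</span>'))
#   [('hello', '<span data-id="x">hello</span>')]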


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
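
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the markup is made up). Nested same-name tags are tracked on a
# stack, so only the matching close ends the element:
#
#   >>> get_element_text_and_html_by_tag('a', '<div><a href="#">link</a></div>')
#   ('link', '<a href="#">link</a>')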


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
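
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). <br> and </p><p> boundaries become newlines, remaining tags are
# stripped, and entities are decoded:
#
#   >>> clean_html('<p>one<br>two</p><p>&amp; three</p>')
#   'one\ntwo\n& three'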


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
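
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). Timestamps keep their digits (':' becomes '_'), and restricted
# mode maps unsafe characters to ASCII-only replacements:
#
#   >>> sanitize_filename('10:30 news')
#   '10_30 news'
#   >>> sanitize_filename('Foo: bar/baz?', restricted=True)
#   'Foo_-_bar_baz'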


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
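
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the URLs are made up):
#
#   >>> sanitize_url('//example.com/watch')
#   'http://example.com/watch'
#   >>> sanitize_url('httpss://example.com/watch')
#   'https://example.com/watch'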


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
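
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the credentials are made up). URL-embedded credentials are moved
# into an HTTP Basic Authorization header value:
#
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')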


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
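
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). Duplicates are dropped while first-seen order is preserved:
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]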


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
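
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). '&' is replaced first so the '&' introduced by the other
# entities is not escaped a second time:
#
#   >>> escapeHTML('"Jack & Jill"')
#   '&quot;Jack &amp; Jill&quot;'
#   >>> unescapeHTML('&quot;Jack &amp; Jill&quot;')
#   '"Jack & Jill"'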


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
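
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). Successive divmod calls split a millisecond count into
# h/m/s/ms fields, which formatSeconds then renders:
#
#   >>> timetuple_from_msec(123456)
#   Time(hours=0, minutes=2, seconds=3, milliseconds=456)
#   >>> formatSeconds(3661)
#   '1:01:01'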


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)

class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)

class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1731
1732
1733 def parse_iso8601(date_str, delimiter='T', timezone=None):
1734 """ Return a UNIX timestamp from the given date """
1735
1736 if date_str is None:
1737 return None
1738
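# Strip fractional seconds - the strptime format used below has no field for them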
1739 date_str = re.sub(r'\.[0-9]+', '', date_str)
1740
1741 if timezone is None:
1742 timezone, date_str = extract_timezone(date_str)
1743
1744 try:
1745 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1746 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1747 return calendar.timegm(dt.timetuple())
1748 except ValueError:
1749 pass
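# Illustrative usage (a sketch; timestamps verified by hand, not taken from this file):
#   parse_iso8601('2014-03-23T23:04:26+0100') == 1395612266  # i.e. 22:04:26 UTC
#   parse_iso8601('2014-03-23 23:04:26Z', delimiter=' ') == 1395615866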
1750
1751
1752 def date_formats(day_first=True):
1753 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1754
1755
1756 def unified_strdate(date_str, day_first=True):
1757 """Return a string with the date in the format YYYYMMDD"""
1758
1759 if date_str is None:
1760 return None
1761 upload_date = None
1762 # Replace commas
1763 date_str = date_str.replace(',', ' ')
1764 # Remove AM/PM + timezone
1765 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1766 _, date_str = extract_timezone(date_str)
1767
1768 for expression in date_formats(day_first):
1769 try:
1770 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1771 except ValueError:
1772 pass
1773 if upload_date is None:
1774 timetuple = email.utils.parsedate_tz(date_str)
1775 if timetuple:
1776 try:
1777 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1778 except ValueError:
1779 pass
1780 if upload_date is not None:
1781 return compat_str(upload_date)
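# Illustrative usage (assuming '%B %d %Y' is among the DATE_FORMATS defined
# earlier in this file):
#   unified_strdate('December 21, 2010') == '20101221'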
1782
1783
1784 def unified_timestamp(date_str, day_first=True):
1785 if date_str is None:
1786 return None
1787
1788 date_str = re.sub(r'[,|]', '', date_str)
1789
1790 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 # Remove AM/PM + timezone
1794 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1795
1796 # Remove unrecognized timezones from ISO 8601 alike timestamps
1797 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1798 if m:
1799 date_str = date_str[:-len(m.group('tz'))]
1800
1801 # Python only supports microseconds, so remove nanoseconds
1802 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1803 if m:
1804 date_str = m.group(1)
1805
1806 for expression in date_formats(day_first):
1807 try:
1808 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1809 return calendar.timegm(dt.timetuple())
1810 except ValueError:
1811 pass
1812 timetuple = email.utils.parsedate_tz(date_str)
1813 if timetuple:
1814 return calendar.timegm(timetuple) + pm_delta * 3600
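# Illustrative usage (same DATE_FORMATS assumption as unified_strdate above):
#   unified_timestamp('December 21, 2010') == 1292889600  # 2010-12-21 00:00 UTC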
1815
1816
1817 def determine_ext(url, default_ext='unknown_video'):
1818 if url is None or '.' not in url:
1819 return default_ext
1820 guess = url.partition('?')[0].rpartition('.')[2]
1821 if re.match(r'^[A-Za-z0-9]+$', guess):
1822 return guess
1823 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1824 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1825 return guess.rstrip('/')
1826 else:
1827 return default_ext
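# Illustrative usage:
#   determine_ext('http://example.com/foo/bar.mp4?download=true') == 'mp4'
#   determine_ext('http://example.com/stream') == 'unknown_video'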
1828
1829
1830 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1831 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1832
1833
1834 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1835 """
1836 Return a datetime object from a string in the format YYYYMMDD or
1837 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1838
1839 format: string date format used to parse date_str into a datetime object
1840 precision: round the time portion of a datetime object.
1841 auto|microsecond|second|minute|hour|day.
1842 auto: round to the unit provided in date_str (if applicable).
1843 """
1844 auto_precision = False
1845 if precision == 'auto':
1846 auto_precision = True
1847 precision = 'microsecond'
1848 today = datetime_round(datetime.datetime.now(), precision)
1849 if date_str in ('now', 'today'):
1850 return today
1851 if date_str == 'yesterday':
1852 return today - datetime.timedelta(days=1)
1853 match = re.match(
1854 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1855 date_str)
1856 if match is not None:
1857 start_time = datetime_from_str(match.group('start'), precision, format)
1858 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1859 unit = match.group('unit')
1860 if unit == 'month' or unit == 'year':
1861 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1862 unit = 'day'
1863 else:
1864 if unit == 'week':
1865 unit = 'day'
1866 time *= 7
1867 delta = datetime.timedelta(**{unit + 's': time})
1868 new_date = start_time + delta
1869 if auto_precision:
1870 return datetime_round(new_date, unit)
1871 return new_date
1872
1873 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1874
1875
1876 def date_from_str(date_str, format='%Y%m%d'):
1877 """
1878 Return a datetime object from a string in the format YYYYMMDD or
1879 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1880
1881 format: string date format used to parse date_str into a datetime object
1882 """
1883 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1884
1885
1886 def datetime_add_months(dt, months):
1887 """Increment/Decrement a datetime object by months."""
1888 month = dt.month + months - 1
1889 year = dt.year + month // 12
1890 month = month % 12 + 1
1891 day = min(dt.day, calendar.monthrange(year, month)[1])
1892 return dt.replace(year, month, day)
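# Illustrative usage (the day is clamped to the last day of the target month):
#   datetime_add_months(datetime.date(2020, 1, 31), 1) == datetime.date(2020, 2, 29)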
1893
1894
1895 def datetime_round(dt, precision='day'):
1896 """
1897 Round a datetime object's time to a specific precision
1898 """
1899 if precision == 'microsecond':
1900 return dt
1901
1902 unit_seconds = {
1903 'day': 86400,
1904 'hour': 3600,
1905 'minute': 60,
1906 'second': 1,
1907 }
1908 roundto = lambda x, n: ((x + n / 2) // n) * n
1909 timestamp = calendar.timegm(dt.timetuple())
1910 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1911
1912
1913 def hyphenate_date(date_str):
1914 """
1915 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1916 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1917 if match is not None:
1918 return '-'.join(match.groups())
1919 else:
1920 return date_str
1921
1922
1923 class DateRange(object):
1924 """Represents a time interval between two dates"""
1925
1926 def __init__(self, start=None, end=None):
1927 """start and end must be strings in the format accepted by date"""
1928 if start is not None:
1929 self.start = date_from_str(start)
1930 else:
1931 self.start = datetime.datetime.min.date()
1932 if end is not None:
1933 self.end = date_from_str(end)
1934 else:
1935 self.end = datetime.datetime.max.date()
1936 if self.start > self.end:
1937 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1938
1939 @classmethod
1940 def day(cls, day):
1941 """Returns a range that only contains the given day"""
1942 return cls(day, day)
1943
1944 def __contains__(self, date):
1945 """Check if the date is in the range"""
1946 if not isinstance(date, datetime.date):
1947 date = date_from_str(date)
1948 return self.start <= date <= self.end
1949
1950 def __str__(self):
1951 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
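# Illustrative usage:
#   '20200315' in DateRange('20200101', '20200630')  # True
#   DateRange.day('20200101')                        # the single day 2020-01-01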
1952
1953
1954 def platform_name():
1955 """ Returns the platform name as a compat_str """
1956 res = platform.platform()
1957 if isinstance(res, bytes):
1958 res = res.decode(preferredencoding())
1959
1960 assert isinstance(res, compat_str)
1961 return res
1962
1963
1964 def get_windows_version():
1965 ''' Get Windows version. None if it's not running on Windows '''
1966 if compat_os_name == 'nt':
1967 return version_tuple(platform.win32_ver()[1])
1968 else:
1969 return None
1970
1971
1972 def _windows_write_string(s, out):
1973 """ Returns True if the string was written using special methods,
1974 False if it has yet to be written out."""
1975 # Adapted from http://stackoverflow.com/a/3259271/35070
1976
1977 import ctypes.wintypes
1978
1979 WIN_OUTPUT_IDS = {
1980 1: -11,
1981 2: -12,
1982 }
1983
1984 try:
1985 fileno = out.fileno()
1986 except AttributeError:
1987 # If the output stream doesn't have a fileno, it's virtual
1988 return False
1989 except io.UnsupportedOperation:
1990 # Some strange Windows pseudo files?
1991 return False
1992 if fileno not in WIN_OUTPUT_IDS:
1993 return False
1994
1995 GetStdHandle = compat_ctypes_WINFUNCTYPE(
1996 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1997 ('GetStdHandle', ctypes.windll.kernel32))
1998 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1999
2000 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2001 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2002 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2003 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2004 written = ctypes.wintypes.DWORD(0)
2005
2006 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2007 FILE_TYPE_CHAR = 0x0002
2008 FILE_TYPE_REMOTE = 0x8000
2009 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2010 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2011 ctypes.POINTER(ctypes.wintypes.DWORD))(
2012 ('GetConsoleMode', ctypes.windll.kernel32))
2013 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2014
2015 def not_a_console(handle):
2016 if handle == INVALID_HANDLE_VALUE or handle is None:
2017 return True
2018 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2019 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2020
2021 if not_a_console(h):
2022 return False
2023
2024 def next_nonbmp_pos(s):
2025 try:
2026 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2027 except StopIteration:
2028 return len(s)
2029
2030 while s:
2031 count = min(next_nonbmp_pos(s), 1024)
2032
2033 ret = WriteConsoleW(
2034 h, s, count if count else 2, ctypes.byref(written), None)
2035 if ret == 0:
2036 raise OSError('Failed to write string')
2037 if not count: # We just wrote a non-BMP character
2038 assert written.value == 2
2039 s = s[1:]
2040 else:
2041 assert written.value > 0
2042 s = s[written.value:]
2043 return True
2044
2045
2046 def write_string(s, out=None, encoding=None):
2047 if out is None:
2048 out = sys.stderr
2049 assert type(s) == compat_str
2050
2051 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2052 if _windows_write_string(s, out):
2053 return
2054
2055 if ('b' in getattr(out, 'mode', '')
2056 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2057 byt = s.encode(encoding or preferredencoding(), 'ignore')
2058 out.write(byt)
2059 elif hasattr(out, 'buffer'):
2060 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2061 byt = s.encode(enc, 'ignore')
2062 out.buffer.write(byt)
2063 else:
2064 out.write(s)
2065 out.flush()
2066
2067
2068 def bytes_to_intlist(bs):
2069 if not bs:
2070 return []
2071 if isinstance(bs[0], int): # Python 3
2072 return list(bs)
2073 else:
2074 return [ord(c) for c in bs]
2075
2076
2077 def intlist_to_bytes(xs):
2078 if not xs:
2079 return b''
2080 return compat_struct_pack('%dB' % len(xs), *xs)
2081
2082
2083 # Cross-platform file locking
2084 if sys.platform == 'win32':
2085 import ctypes.wintypes
2086 import msvcrt
2087
2088 class OVERLAPPED(ctypes.Structure):
2089 _fields_ = [
2090 ('Internal', ctypes.wintypes.LPVOID),
2091 ('InternalHigh', ctypes.wintypes.LPVOID),
2092 ('Offset', ctypes.wintypes.DWORD),
2093 ('OffsetHigh', ctypes.wintypes.DWORD),
2094 ('hEvent', ctypes.wintypes.HANDLE),
2095 ]
2096
2097 kernel32 = ctypes.windll.kernel32
2098 LockFileEx = kernel32.LockFileEx
2099 LockFileEx.argtypes = [
2100 ctypes.wintypes.HANDLE, # hFile
2101 ctypes.wintypes.DWORD, # dwFlags
2102 ctypes.wintypes.DWORD, # dwReserved
2103 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2104 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2105 ctypes.POINTER(OVERLAPPED) # Overlapped
2106 ]
2107 LockFileEx.restype = ctypes.wintypes.BOOL
2108 UnlockFileEx = kernel32.UnlockFileEx
2109 UnlockFileEx.argtypes = [
2110 ctypes.wintypes.HANDLE, # hFile
2111 ctypes.wintypes.DWORD, # dwReserved
2112 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2113 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2114 ctypes.POINTER(OVERLAPPED) # Overlapped
2115 ]
2116 UnlockFileEx.restype = ctypes.wintypes.BOOL
2117 whole_low = 0xffffffff
2118 whole_high = 0x7fffffff
2119
2120 def _lock_file(f, exclusive):
2121 overlapped = OVERLAPPED()
2122 overlapped.Offset = 0
2123 overlapped.OffsetHigh = 0
2124 overlapped.hEvent = 0
2125 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2126 handle = msvcrt.get_osfhandle(f.fileno())
2127 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2128 whole_low, whole_high, f._lock_file_overlapped_p):
2129 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2130
2131 def _unlock_file(f):
2132 assert f._lock_file_overlapped_p
2133 handle = msvcrt.get_osfhandle(f.fileno())
2134 if not UnlockFileEx(handle, 0,
2135 whole_low, whole_high, f._lock_file_overlapped_p):
2136 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2137
2138 else:
2139 # Some platforms, such as Jython, are missing fcntl
2140 try:
2141 import fcntl
2142
2143 def _lock_file(f, exclusive):
2144 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
2145
2146 def _unlock_file(f):
2147 fcntl.flock(f, fcntl.LOCK_UN)
2148 except ImportError:
2149 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2150
2151 def _lock_file(f, exclusive):
2152 raise IOError(UNSUPPORTED_MSG)
2153
2154 def _unlock_file(f):
2155 raise IOError(UNSUPPORTED_MSG)
2156
2157
2158 class locked_file(object):
2159 def __init__(self, filename, mode, encoding=None):
2160 assert mode in ['r', 'a', 'w']
2161 self.f = io.open(filename, mode, encoding=encoding)
2162 self.mode = mode
2163
2164 def __enter__(self):
2165 exclusive = self.mode != 'r'
2166 try:
2167 _lock_file(self.f, exclusive)
2168 except IOError:
2169 self.f.close()
2170 raise
2171 return self
2172
2173 def __exit__(self, etype, value, traceback):
2174 try:
2175 _unlock_file(self.f)
2176 finally:
2177 self.f.close()
2178
2179 def __iter__(self):
2180 return iter(self.f)
2181
2182 def write(self, *args):
2183 return self.f.write(*args)
2184
2185 def read(self, *args):
2186 return self.f.read(*args)
2187
2188
2189 def get_filesystem_encoding():
2190 encoding = sys.getfilesystemencoding()
2191 return encoding if encoding is not None else 'utf-8'
2192
2193
2194 def shell_quote(args):
2195 quoted_args = []
2196 encoding = get_filesystem_encoding()
2197 for a in args:
2198 if isinstance(a, bytes):
2199 # We may get a filename encoded with 'encodeFilename'
2200 a = a.decode(encoding)
2201 quoted_args.append(compat_shlex_quote(a))
2202 return ' '.join(quoted_args)
2203
2204
2205 def smuggle_url(url, data):
2206 """ Pass additional data in a URL for internal use. """
2207
2208 url, idata = unsmuggle_url(url, {})
2209 data.update(idata)
2210 sdata = compat_urllib_parse_urlencode(
2211 {'__youtubedl_smuggle': json.dumps(data)})
2212 return url + '#' + sdata
2213
2214
2215 def unsmuggle_url(smug_url, default=None):
2216 if '#__youtubedl_smuggle' not in smug_url:
2217 return smug_url, default
2218 url, _, sdata = smug_url.rpartition('#')
2219 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2220 data = json.loads(jsond)
2221 return url, data
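# Illustrative round-trip (the URL fragment carries the smuggled data):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   unsmuggle_url(url) == ('https://example.com/video', {'referer': 'https://example.com/'})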
2222
2223
2224 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2225 """ Formats numbers with decimal sufixes like K, M, etc """
2226 num, factor = float_or_none(num), float(factor)
2227 if num is None:
2228 return None
2229 exponent = 0 if num == 0 else int(math.log(abs(num), factor))  # abs() so negative numbers don't crash math.log
2230 suffix = ['', *'kMGTPEZY'][exponent]
2231 if factor == 1024:
2232 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2233 converted = num / (factor ** exponent)
2234 return fmt % (converted, suffix)
2235
2236
2237 def format_bytes(bytes):
2238 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
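# Illustrative usage:
#   format_decimal_suffix(1500, '%.1f%s') == '1.5k'
#   format_bytes(2048) == '2.00KiB'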
2239
2240
2241 def lookup_unit_table(unit_table, s):
2242 units_re = '|'.join(re.escape(u) for u in unit_table)
2243 m = re.match(
2244 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2245 if not m:
2246 return None
2247 num_str = m.group('num').replace(',', '.')
2248 mult = unit_table[m.group('unit')]
2249 return int(float(num_str) * mult)
2250
2251
2252 def parse_filesize(s):
2253 if s is None:
2254 return None
2255
2256 # The lower-case forms are of course incorrect and unofficial,
2257 # but we support those too
2258 _UNIT_TABLE = {
2259 'B': 1,
2260 'b': 1,
2261 'bytes': 1,
2262 'KiB': 1024,
2263 'KB': 1000,
2264 'kB': 1024,
2265 'Kb': 1000,
2266 'kb': 1000,
2267 'kilobytes': 1000,
2268 'kibibytes': 1024,
2269 'MiB': 1024 ** 2,
2270 'MB': 1000 ** 2,
2271 'mB': 1024 ** 2,
2272 'Mb': 1000 ** 2,
2273 'mb': 1000 ** 2,
2274 'megabytes': 1000 ** 2,
2275 'mebibytes': 1024 ** 2,
2276 'GiB': 1024 ** 3,
2277 'GB': 1000 ** 3,
2278 'gB': 1024 ** 3,
2279 'Gb': 1000 ** 3,
2280 'gb': 1000 ** 3,
2281 'gigabytes': 1000 ** 3,
2282 'gibibytes': 1024 ** 3,
2283 'TiB': 1024 ** 4,
2284 'TB': 1000 ** 4,
2285 'tB': 1024 ** 4,
2286 'Tb': 1000 ** 4,
2287 'tb': 1000 ** 4,
2288 'terabytes': 1000 ** 4,
2289 'tebibytes': 1024 ** 4,
2290 'PiB': 1024 ** 5,
2291 'PB': 1000 ** 5,
2292 'pB': 1024 ** 5,
2293 'Pb': 1000 ** 5,
2294 'pb': 1000 ** 5,
2295 'petabytes': 1000 ** 5,
2296 'pebibytes': 1024 ** 5,
2297 'EiB': 1024 ** 6,
2298 'EB': 1000 ** 6,
2299 'eB': 1024 ** 6,
2300 'Eb': 1000 ** 6,
2301 'eb': 1000 ** 6,
2302 'exabytes': 1000 ** 6,
2303 'exbibytes': 1024 ** 6,
2304 'ZiB': 1024 ** 7,
2305 'ZB': 1000 ** 7,
2306 'zB': 1024 ** 7,
2307 'Zb': 1000 ** 7,
2308 'zb': 1000 ** 7,
2309 'zettabytes': 1000 ** 7,
2310 'zebibytes': 1024 ** 7,
2311 'YiB': 1024 ** 8,
2312 'YB': 1000 ** 8,
2313 'yB': 1024 ** 8,
2314 'Yb': 1000 ** 8,
2315 'yb': 1000 ** 8,
2316 'yottabytes': 1000 ** 8,
2317 'yobibytes': 1024 ** 8,
2318 }
2319
2320 return lookup_unit_table(_UNIT_TABLE, s)
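# Illustrative usage (decimal vs. binary prefixes):
#   parse_filesize('1.5GB') == 1500000000
#   parse_filesize('10 KiB') == 10240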
2321
2322
2323 def parse_count(s):
2324 if s is None:
2325 return None
2326
2327 s = re.sub(r'^[^\d]+\s', '', s).strip()
2328
2329 if re.match(r'^[\d,.]+$', s):
2330 return str_to_int(s)
2331
2332 _UNIT_TABLE = {
2333 'k': 1000,
2334 'K': 1000,
2335 'm': 1000 ** 2,
2336 'M': 1000 ** 2,
2337 'kk': 1000 ** 2,
2338 'KK': 1000 ** 2,
2339 'b': 1000 ** 3,
2340 'B': 1000 ** 3,
2341 }
2342
2343 ret = lookup_unit_table(_UNIT_TABLE, s)
2344 if ret is not None:
2345 return ret
2346
2347 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2348 if mobj:
2349 return str_to_int(mobj.group(1))
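# Illustrative usage:
#   parse_count('1.5M views') == 1500000
#   parse_count('1,234') == 1234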
2350
2351
2352 def parse_resolution(s):
2353 if s is None:
2354 return {}
2355
2356 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2357 if mobj:
2358 return {
2359 'width': int(mobj.group('w')),
2360 'height': int(mobj.group('h')),
2361 }
2362
2363 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2364 if mobj:
2365 return {'height': int(mobj.group(1))}
2366
2367 mobj = re.search(r'\b([48])[kK]\b', s)
2368 if mobj:
2369 return {'height': int(mobj.group(1)) * 540}
2370
2371 return {}
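# Illustrative usage:
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p') == {'height': 720}
#   parse_resolution('4K') == {'height': 2160}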
2372
2373
2374 def parse_bitrate(s):
2375 if not isinstance(s, compat_str):
2376 return
2377 mobj = re.search(r'\b(\d+)\s*kbps', s)
2378 if mobj:
2379 return int(mobj.group(1))
2380
2381
2382 def month_by_name(name, lang='en'):
2383 """ Return the number of a month by (locale-independently) English name """
2384
2385 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2386
2387 try:
2388 return month_names.index(name) + 1
2389 except ValueError:
2390 return None
2391
2392
2393 def month_by_abbreviation(abbrev):
2394 """ Return the number of a month by (locale-independently) English
2395 abbreviations """
2396
2397 try:
2398 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2399 except ValueError:
2400 return None
2401
2402
2403 def fix_xml_ampersands(xml_str):
2404 """Replace all the '&' by '&amp;' in XML"""
2405 return re.sub(
2406 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2407 '&amp;',
2408 xml_str)
2409
2410
2411 def setproctitle(title):
2412 assert isinstance(title, compat_str)
2413
2414 # ctypes in Jython is not complete
2415 # http://bugs.jython.org/issue2148
2416 if sys.platform.startswith('java'):
2417 return
2418
2419 try:
2420 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2421 except OSError:
2422 return
2423 except TypeError:
2424 # LoadLibrary in Windows Python 2.7.13 only expects
2425 # a bytestring, but since unicode_literals turns
2426 # every string into a unicode string, it fails.
2427 return
2428 title_bytes = title.encode('utf-8')
2429 buf = ctypes.create_string_buffer(len(title_bytes))
2430 buf.value = title_bytes
2431 try:
2432 libc.prctl(15, buf, 0, 0, 0)
2433 except AttributeError:
2434 return # Strange libc, just skip this
2435
2436
2437 def remove_start(s, start):
2438 return s[len(start):] if s is not None and s.startswith(start) else s
2439
2440
2441 def remove_end(s, end):
2442 return s[:-len(end)] if s is not None and s.endswith(end) else s
2443
2444
2445 def remove_quotes(s):
2446 if s is None or len(s) < 2:
2447 return s
2448 for quote in ('"', "'", ):
2449 if s[0] == quote and s[-1] == quote:
2450 return s[1:-1]
2451 return s
2452
2453
2454 def get_domain(url):
2455 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2456 return domain.group('domain') if domain else None
2457
2458
2459 def url_basename(url):
2460 path = compat_urlparse.urlparse(url).path
2461 return path.strip('/').split('/')[-1]
2462
2463
2464 def base_url(url):
2465 return re.match(r'https?://[^?#&]+/', url).group()
2466
2467
2468 def urljoin(base, path):
2469 if isinstance(path, bytes):
2470 path = path.decode('utf-8')
2471 if not isinstance(path, compat_str) or not path:
2472 return None
2473 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2474 return path
2475 if isinstance(base, bytes):
2476 base = base.decode('utf-8')
2477 if not isinstance(base, compat_str) or not re.match(
2478 r'^(?:https?:)?//', base):
2479 return None
2480 return compat_urlparse.urljoin(base, path)
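# Illustrative usage (returns None rather than raising on bad input):
#   urljoin('https://example.com/a/', 'b.mp4') == 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/a/', '//cdn.example.com/b.mp4') == '//cdn.example.com/b.mp4'
#   urljoin(None, 'b.mp4') is None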
2481
2482
2483 class HEADRequest(compat_urllib_request.Request):
2484 def get_method(self):
2485 return 'HEAD'
2486
2487
2488 class PUTRequest(compat_urllib_request.Request):
2489 def get_method(self):
2490 return 'PUT'
2491
2492
2493 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2494 if get_attr and v is not None:
2495 v = getattr(v, get_attr, None)
2496 try:
2497 return int(v) * invscale // scale
2498 except (ValueError, TypeError, OverflowError):
2499 return default
2500
2501
2502 def str_or_none(v, default=None):
2503 return default if v is None else compat_str(v)
2504
2505
2506 def str_to_int(int_str):
2507 """ A more relaxed version of int_or_none """
2508 if isinstance(int_str, compat_integer_types):
2509 return int_str
2510 elif isinstance(int_str, compat_str):
2511 int_str = re.sub(r'[,\.\+]', '', int_str)
2512 return int_or_none(int_str)
2513
2514
2515 def float_or_none(v, scale=1, invscale=1, default=None):
2516 if v is None:
2517 return default
2518 try:
2519 return float(v) * invscale / scale
2520 except (ValueError, TypeError):
2521 return default
2522
2523
2524 def bool_or_none(v, default=None):
2525 return v if isinstance(v, bool) else default
2526
2527
2528 def strip_or_none(v, default=None):
2529 return v.strip() if isinstance(v, compat_str) else default
2530
2531
2532 def url_or_none(url):
2533 if not url or not isinstance(url, compat_str):
2534 return None
2535 url = url.strip()
2536 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
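# Illustrative usage of the *_or_none coercion helpers:
#   int_or_none('42') == 42 and int_or_none(None, default=0) == 0
#   float_or_none('1.5', invscale=1000) == 1500.0
#   url_or_none('//example.com/x') == '//example.com/x'
#   url_or_none('example.com') is None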
2537
2538
2539 def strftime_or_none(timestamp, date_format, default=None):
2540 datetime_object = None
2541 try:
2542 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2543 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2544 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2545 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2546 return datetime_object.strftime(date_format)
2547 except (ValueError, TypeError, AttributeError):
2548 return default
2549
2550
2551 def parse_duration(s):
2552 if not isinstance(s, compat_basestring):
2553 return None
2554 s = s.strip()
2555 if not s:
2556 return None
2557
2558 days, hours, mins, secs, ms = [None] * 5
2559 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
2560 if m:
2561 days, hours, mins, secs, ms = m.groups()
2562 else:
2563 m = re.match(
2564 r'''(?ix)(?:P?
2565 (?:
2566 [0-9]+\s*y(?:ears?)?\s*
2567 )?
2568 (?:
2569 [0-9]+\s*m(?:onths?)?\s*
2570 )?
2571 (?:
2572 [0-9]+\s*w(?:eeks?)?\s*
2573 )?
2574 (?:
2575 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2576 )?
2577 T)?
2578 (?:
2579 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2580 )?
2581 (?:
2582 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2583 )?
2584 (?:
2585 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2586 )?Z?$''', s)
2587 if m:
2588 days, hours, mins, secs, ms = m.groups()
2589 else:
2590 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2591 if m:
2592 hours, mins = m.groups()
2593 else:
2594 return None
2595
2596 duration = 0
2597 if secs:
2598 duration += float(secs)
2599 if mins:
2600 duration += float(mins) * 60
2601 if hours:
2602 duration += float(hours) * 60 * 60
2603 if days:
2604 duration += float(days) * 24 * 60 * 60
2605 if ms:
2606 duration += float(ms)
2607 return duration
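# Illustrative usage (plain, ISO 8601 and free-form durations):
#   parse_duration('1:30') == 90.0
#   parse_duration('PT1H30M') == 5400.0
#   parse_duration('2.5 mins') == 150.0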
2608
2609
2610 def prepend_extension(filename, ext, expected_real_ext=None):
2611 name, real_ext = os.path.splitext(filename)
2612 return (
2613 '{0}.{1}{2}'.format(name, ext, real_ext)
2614 if not expected_real_ext or real_ext[1:] == expected_real_ext
2615 else '{0}.{1}'.format(filename, ext))
2616
2617
2618 def replace_extension(filename, ext, expected_real_ext=None):
2619 name, real_ext = os.path.splitext(filename)
2620 return '{0}.{1}'.format(
2621 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2622 ext)
2623
2624
2625 def check_executable(exe, args=[]):
2626 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2627 args can be a list of arguments for a short output (like -version) """
2628 try:
2629 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2630 except OSError:
2631 return False
2632 return exe
2633
2634
2635 def _get_exe_version_output(exe, args):
2636 try:
2637 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2638 # SIGTTOU if yt-dlp is run in the background.
2639 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2640 out, _ = Popen(
2641 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2642 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2643 except OSError:
2644 return False
2645 if isinstance(out, bytes): # Python 2.x
2646 out = out.decode('ascii', 'ignore')
2647 return out
2648
2649
2650 def detect_exe_version(output, version_re=None, unrecognized='present'):
2651 assert isinstance(output, compat_str)
2652 if version_re is None:
2653 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2654 m = re.search(version_re, output)
2655 if m:
2656 return m.group(1)
2657 else:
2658 return unrecognized
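# Illustrative usage (with the default version_re; the input line is the shape
# typically printed by `ffmpeg -version`):
#   detect_exe_version('ffmpeg version 4.4.1 Copyright (c) 2000-2021 ...') == '4.4.1'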
2659
2660
2661 def get_exe_version(exe, args=['--version'],
2662 version_re=None, unrecognized='present'):
2663 """ Returns the version of the specified executable,
2664 or False if the executable is not present """
2665 out = _get_exe_version_output(exe, args)
2666 return detect_exe_version(out, version_re, unrecognized) if out else False
2667
2668
2669 class LazyList(collections.abc.Sequence):
2670 ''' Lazy immutable list from an iterable
2671 Note that slices of a LazyList are lists and not LazyLists'''
2672
2673 class IndexError(IndexError):
2674 pass
2675
2676 def __init__(self, iterable, *, reverse=False, _cache=None):
2677 self.__iterable = iter(iterable)
2678 self.__cache = [] if _cache is None else _cache
2679 self.__reversed = reverse
2680
2681 def __iter__(self):
2682 if self.__reversed:
2683 # We need to consume the entire iterable to iterate in reverse
2684 yield from self.exhaust()
2685 return
2686 yield from self.__cache
2687 for item in self.__iterable:
2688 self.__cache.append(item)
2689 yield item
2690
2691 def __exhaust(self):
2692 self.__cache.extend(self.__iterable)
2693 # Discard the emptied iterable to make it pickle-able
2694 self.__iterable = []
2695 return self.__cache
2696
2697 def exhaust(self):
2698 ''' Evaluate the entire iterable '''
2699 return self.__exhaust()[::-1 if self.__reversed else 1]
2700
2701 @staticmethod
2702 def __reverse_index(x):
2703 return None if x is None else -(x + 1)
2704
2705 def __getitem__(self, idx):
2706 if isinstance(idx, slice):
2707 if self.__reversed:
2708 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2709 start, stop, step = idx.start, idx.stop, idx.step or 1
2710 elif isinstance(idx, int):
2711 if self.__reversed:
2712 idx = self.__reverse_index(idx)
2713 start, stop, step = idx, idx, 0
2714 else:
2715 raise TypeError('indices must be integers or slices')
2716 if ((start or 0) < 0 or (stop or 0) < 0
2717 or (start is None and step < 0)
2718 or (stop is None and step > 0)):
2719 # We need to consume the entire iterable to be able to slice from the end
2720 # Obviously, never use this with infinite iterables
2721 self.__exhaust()
2722 try:
2723 return self.__cache[idx]
2724 except IndexError as e:
2725 raise self.IndexError(e) from e
2726 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2727 if n > 0:
2728 self.__cache.extend(itertools.islice(self.__iterable, n))
2729 try:
2730 return self.__cache[idx]
2731 except IndexError as e:
2732 raise self.IndexError(e) from e
2733
2734 def __bool__(self):
2735 try:
2736 self[-1] if self.__reversed else self[0]
2737 except self.IndexError:
2738 return False
2739 return True
2740
2741 def __len__(self):
2742 self.__exhaust()
2743 return len(self.__cache)
2744
2745 def __reversed__(self):
2746 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2747
2748 def __copy__(self):
2749 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2750
2751 def __repr__(self):
2752 # repr and str should mimic a list. So we exhaust the iterable
2753 return repr(self.exhaust())
2754
2755 def __str__(self):
2756 return repr(self.exhaust())
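# Illustrative usage (items are pulled from the underlying iterable only as needed):
#   lst = LazyList(itertools.count())
#   lst[10] == 10               # consumes and caches items 0..10
#   lst[:5] == [0, 1, 2, 3, 4]  # slices are plain lists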
2757
2758
2759 class PagedList:
2760
2761 class IndexError(IndexError):
2762 pass
2763
2764 def __len__(self):
2765 # This is only useful for tests
2766 return len(self.getslice())
2767
2768 def __init__(self, pagefunc, pagesize, use_cache=True):
2769 self._pagefunc = pagefunc
2770 self._pagesize = pagesize
2771 self._use_cache = use_cache
2772 self._cache = {}
2773
2774 def getpage(self, pagenum):
2775 page_results = self._cache.get(pagenum)
2776 if page_results is None:
2777 page_results = list(self._pagefunc(pagenum))
2778 if self._use_cache:
2779 self._cache[pagenum] = page_results
2780 return page_results
2781
2782 def getslice(self, start=0, end=None):
2783 return list(self._getslice(start, end))
2784
2785 def _getslice(self, start, end):
2786 raise NotImplementedError('This method must be implemented by subclasses')
2787
2788 def __getitem__(self, idx):
2789 # NOTE: cache must be enabled if this is used
2790 if not isinstance(idx, int) or idx < 0:
2791 raise TypeError('indices must be non-negative integers')
2792 entries = self.getslice(idx, idx + 1)
2793 if not entries:
2794 raise self.IndexError()
2795 return entries[0]
2796
2797
2798 class OnDemandPagedList(PagedList):
2799 def _getslice(self, start, end):
2800 for pagenum in itertools.count(start // self._pagesize):
2801 firstid = pagenum * self._pagesize
2802 nextfirstid = pagenum * self._pagesize + self._pagesize
2803 if start >= nextfirstid:
2804 continue
2805
2806 startv = (
2807 start % self._pagesize
2808 if firstid <= start < nextfirstid
2809 else 0)
2810 endv = (
2811 ((end - 1) % self._pagesize) + 1
2812 if (end is not None and firstid <= end <= nextfirstid)
2813 else None)
2814
2815 page_results = self.getpage(pagenum)
2816 if startv != 0 or endv is not None:
2817 page_results = page_results[startv:endv]
2818 yield from page_results
2819
2820 # A little optimization - if the current page is not "full", i.e. does
2821 # not contain page_size videos, then we can assume that this page
2822 # is the last one - there are no more ids on further pages,
2823 # so there is no need to query again.
2824 if len(page_results) + startv < self._pagesize:
2825 break
2826
2827 # If we got the whole page, but the next page is not interesting,
2828 # break out early as well
2829 if end == nextfirstid:
2830 break
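# Illustrative usage (pages are fetched on demand and cached):
#   pl = OnDemandPagedList(lambda n: range(n * 10, (n + 1) * 10), 10)
#   pl.getslice(5, 8) == [5, 6, 7]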
2831
2832
2833 class InAdvancePagedList(PagedList):
2834 def __init__(self, pagefunc, pagecount, pagesize):
2835 self._pagecount = pagecount
2836 PagedList.__init__(self, pagefunc, pagesize, True)
2837
2838 def _getslice(self, start, end):
2839 start_page = start // self._pagesize
2840 end_page = (
2841 self._pagecount if end is None else (end // self._pagesize + 1))
2842 skip_elems = start - start_page * self._pagesize
2843 only_more = None if end is None else end - start
2844 for pagenum in range(start_page, end_page):
2845 page_results = self.getpage(pagenum)
2846 if skip_elems:
2847 page_results = page_results[skip_elems:]
2848 skip_elems = None
2849 if only_more is not None:
2850 if len(page_results) < only_more:
2851 only_more -= len(page_results)
2852 else:
2853 yield from page_results[:only_more]
2854 break
2855 yield from page_results
2856
2857
2858 def uppercase_escape(s):
2859 unicode_escape = codecs.getdecoder('unicode_escape')
2860 return re.sub(
2861 r'\\U[0-9a-fA-F]{8}',
2862 lambda m: unicode_escape(m.group(0))[0],
2863 s)
2864
2865
2866 def lowercase_escape(s):
2867 unicode_escape = codecs.getdecoder('unicode_escape')
2868 return re.sub(
2869 r'\\u[0-9a-fA-F]{4}',
2870 lambda m: unicode_escape(m.group(0))[0],
2871 s)
2872
2873
2874 def escape_rfc3986(s):
2875 """Escape non-ASCII characters as suggested by RFC 3986"""
2876 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2877 s = s.encode('utf-8')
2878 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2879
2880
2881 def escape_url(url):
2882 """Escape URL as suggested by RFC 3986"""
2883 url_parsed = compat_urllib_parse_urlparse(url)
2884 return url_parsed._replace(
2885 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2886 path=escape_rfc3986(url_parsed.path),
2887 params=escape_rfc3986(url_parsed.params),
2888 query=escape_rfc3986(url_parsed.query),
2889 fragment=escape_rfc3986(url_parsed.fragment)
2890 ).geturl()
2891
2892
2893 def parse_qs(url):
2894 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2895
2896
2897 def read_batch_urls(batch_fd):
2898 def fixup(url):
2899 if not isinstance(url, compat_str):
2900 url = url.decode('utf-8', 'replace')
2901 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2902 for bom in BOM_UTF8:
2903 if url.startswith(bom):
2904 url = url[len(bom):]
2905 url = url.lstrip()
2906 if not url or url.startswith(('#', ';', ']')):
2907 return False
2908 # "#" cannot be stripped out since it is part of the URI
2909 # However, it can be safely stripped out if following a whitespace
2910 return re.split(r'\s#', url, 1)[0].rstrip()
2911
2912 with contextlib.closing(batch_fd) as fd:
2913 return [url for url in map(fixup, fd) if url]
2914
2915
2916 def urlencode_postdata(*args, **kargs):
2917 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2918
2919
2920 def update_url_query(url, query):
2921 if not query:
2922 return url
2923 parsed_url = compat_urlparse.urlparse(url)
2924 qs = compat_parse_qs(parsed_url.query)
2925 qs.update(query)
2926 return compat_urlparse.urlunparse(parsed_url._replace(
2927 query=compat_urllib_parse_urlencode(qs, True)))
2928
2929
2930 def update_Request(req, url=None, data=None, headers={}, query={}):
2931 req_headers = req.headers.copy()
2932 req_headers.update(headers)
2933 req_data = data or req.data
2934 req_url = update_url_query(url or req.get_full_url(), query)
2935 req_get_method = req.get_method()
2936 if req_get_method == 'HEAD':
2937 req_type = HEADRequest
2938 elif req_get_method == 'PUT':
2939 req_type = PUTRequest
2940 else:
2941 req_type = compat_urllib_request.Request
2942 new_req = req_type(
2943 req_url, data=req_data, headers=req_headers,
2944 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2945 if hasattr(req, 'timeout'):
2946 new_req.timeout = req.timeout
2947 return new_req
2948
2949
2950 def _multipart_encode_impl(data, boundary):
2951 content_type = 'multipart/form-data; boundary=%s' % boundary
2952
2953 out = b''
2954 for k, v in data.items():
2955 out += b'--' + boundary.encode('ascii') + b'\r\n'
2956 if isinstance(k, compat_str):
2957 k = k.encode('utf-8')
2958 if isinstance(v, compat_str):
2959 v = v.encode('utf-8')
2960 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2961 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2962 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2963 if boundary.encode('ascii') in content:
2964 raise ValueError('Boundary overlaps with data')
2965 out += content
2966
2967 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2968
2969 return out, content_type
2970
2971
2972 def multipart_encode(data, boundary=None):
2973 '''
2974 Encode a dict to RFC 7578-compliant form-data
2975
2976 data:
2977 A dict where keys and values can be either Unicode or bytes-like
2978 objects.
2979 boundary:
2980 If specified, it must be a Unicode object and is used as the boundary.
2981 Otherwise a random boundary is generated.
2982
2983 Reference: https://tools.ietf.org/html/rfc7578
2984 '''
2985 has_specified_boundary = boundary is not None
2986
2987 while True:
2988 if boundary is None:
2989 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2990
2991 try:
2992 out, content_type = _multipart_encode_impl(data, boundary)
2993 break
2994 except ValueError:
2995 if has_specified_boundary:
2996 raise
2997 boundary = None
2998
2999 return out, content_type
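# Illustrative usage (a fixed boundary makes the output deterministic):
#   multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0] == (
#       b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\n'
#       b'value\r\n--AAAAAA--\r\n')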
3000
3001
3002 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3003 if isinstance(key_or_keys, (list, tuple)):
3004 for key in key_or_keys:
3005 if key not in d or d[key] is None or skip_false_values and not d[key]:
3006 continue
3007 return d[key]
3008 return default
3009 return d.get(key_or_keys, default)
3010
3011
3012 def try_get(src, getter, expected_type=None):
3013 for get in variadic(getter):
3014 try:
3015 v = get(src)
3016 except (AttributeError, KeyError, TypeError, IndexError):
3017 pass
3018 else:
3019 if expected_type is None or isinstance(v, expected_type):
3020 return v
3021
3022
3023 def merge_dicts(*dicts):
3024 merged = {}
3025 for a_dict in dicts:
3026 for k, v in a_dict.items():
3027 if v is None:
3028 continue
3029 if (k not in merged
3030 or (isinstance(v, compat_str) and v
3031 and isinstance(merged[k], compat_str)
3032 and not merged[k])):
3033 merged[k] = v
3034 return merged
3035
3036
3037 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3038 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3039
3040
3041 US_RATINGS = {
3042 'G': 0,
3043 'PG': 10,
3044 'PG-13': 13,
3045 'R': 16,
3046 'NC': 18,
3047 }
3048
3049
3050 TV_PARENTAL_GUIDELINES = {
3051 'TV-Y': 0,
3052 'TV-Y7': 7,
3053 'TV-G': 0,
3054 'TV-PG': 0,
3055 'TV-14': 14,
3056 'TV-MA': 17,
3057 }
3058
3059
3060 def parse_age_limit(s):
3061 if type(s) == int:
3062 return s if 0 <= s <= 21 else None
3063 if not isinstance(s, compat_basestring):
3064 return None
3065 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3066 if m:
3067 return int(m.group('age'))
3068 s = s.upper()
3069 if s in US_RATINGS:
3070 return US_RATINGS[s]
3071 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3072 if m:
3073 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3074 return None
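# Illustrative usage:
#   parse_age_limit('18+') == 18
#   parse_age_limit('PG-13') == 13
#   parse_age_limit('TV-MA') == 17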
3075
3076
3077 def strip_jsonp(code):
3078 return re.sub(
3079 r'''(?sx)^
3080 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3081 (?:\s*&&\s*(?P=func_name))?
3082 \s*\(\s*(?P<callback_data>.*)\);?
3083 \s*?(?://[^\n]*)*$''',
3084 r'\g<callback_data>', code)
3085
3086
3087 def js_to_json(code, vars={}):
3088 # vars is a dict of var, val pairs to substitute
3089 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3090 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3091 INTEGER_TABLE = (
3092 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3093 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3094 )
3095
3096 def fix_kv(m):
3097 v = m.group(0)
3098 if v in ('true', 'false', 'null'):
3099 return v
3100 elif v in ('undefined', 'void 0'):
3101 return 'null'
3102 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3103 return ""
3104
3105 if v[0] in ("'", '"'):
3106 v = re.sub(r'(?s)\\.|"', lambda m: {
3107 '"': '\\"',
3108 "\\'": "'",
3109 '\\\n': '',
3110 '\\x': '\\u00',
3111 }.get(m.group(0), m.group(0)), v[1:-1])
3112 else:
3113 for regex, base in INTEGER_TABLE:
3114 im = re.match(regex, v)
3115 if im:
3116 i = int(im.group(1), base)
3117 return '"%d":' % i if v.endswith(':') else '%d' % i
3118
3119 if v in vars:
3120 return vars[v]
3121
3122 return '"%s"' % v
3123
3124 return re.sub(r'''(?sx)
3125 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3126 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3127 {comment}|,(?={skip}[\]}}])|
3128 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3129 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3130 [0-9]+(?={skip}:)|
3131 !+
3132 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
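# Illustrative usage (the result parses with json.loads):
#   js_to_json("{a: 1, 'b': \"c\", d: 0x1F,}") == '{"a": 1, "b": "c", "d": 31}'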
3133
3134
3135 def qualities(quality_ids):
3136 """ Get a numeric quality value out of a list of possible values """
3137 def q(qid):
3138 try:
3139 return quality_ids.index(qid)
3140 except ValueError:
3141 return -1
3142 return q
3143
3144
3145 POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3146
3147
3148 DEFAULT_OUTTMPL = {
3149 'default': '%(title)s [%(id)s].%(ext)s',
3150 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3151 }
3152 OUTTMPL_TYPES = {
3153 'chapter': None,
3154 'subtitle': None,
3155 'thumbnail': None,
3156 'description': 'description',
3157 'annotation': 'annotations.xml',
3158 'infojson': 'info.json',
3159 'link': None,
3160 'pl_thumbnail': None,
3161 'pl_description': 'description',
3162 'pl_infojson': 'info.json',
3163 }
3164
3165 # As of [1], the format syntax is:
3166 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3167 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3168 STR_FORMAT_RE_TMPL = r'''(?x)
3169 (?<!%)(?P<prefix>(?:%%)*)
3170 %
3171 (?P<has_key>\((?P<key>{0})\))?
3172 (?P<format>
3173 (?P<conversion>[#0\-+ ]+)?
3174 (?P<min_width>\d+)?
3175 (?P<precision>\.\d+)?
3176 (?P<len_mod>[hlL])? # unused in python
3177 {1} # conversion type
3178 )
3179 '''
3180
3181
3182 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3183
3184
3185 def limit_length(s, length):
3186 """ Add ellipses to overly long strings """
3187 if s is None:
3188 return None
3189 ELLIPSES = '...'
3190 if len(s) > length:
3191 return s[:length - len(ELLIPSES)] + ELLIPSES
3192 return s
3193
3194
3195 def version_tuple(v):
3196 return tuple(int(e) for e in re.split(r'[-.]', v))
3197
3198
3199 def is_outdated_version(version, limit, assume_new=True):
3200 if not version:
3201 return not assume_new
3202 try:
3203 return version_tuple(version) < version_tuple(limit)
3204 except ValueError:
3205 return not assume_new
3206
3207
3208 def ytdl_is_updateable():
3209 """ Returns if yt-dlp can be updated with -U """
3210
3211 from .update import is_non_updateable
3212
3213 return not is_non_updateable()
3214
3215
3216 def args_to_str(args):
3217 # Get a short string representation for a subprocess command
3218 return ' '.join(compat_shlex_quote(a) for a in args)
3219
3220
3221 def error_to_compat_str(err):
3222 err_str = str(err)
3223 # On python 2, an error byte string must be decoded with the proper
3224 # encoding rather than ascii
3225 if sys.version_info[0] < 3:
3226 err_str = err_str.decode(preferredencoding())
3227 return err_str
3228
3229
3230 def mimetype2ext(mt):
3231 if mt is None:
3232 return None
3233
3234 mt, _, params = mt.partition(';')
3235 mt = mt.strip()
3236
3237 FULL_MAP = {
3238 'audio/mp4': 'm4a',
3239 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3240 # it's the most popular one
3241 'audio/mpeg': 'mp3',
3242 'audio/x-wav': 'wav',
3243 'audio/wav': 'wav',
3244 'audio/wave': 'wav',
3245 }
3246
3247 ext = FULL_MAP.get(mt)
3248 if ext is not None:
3249 return ext
3250
3251 SUBTYPE_MAP = {
3252 '3gpp': '3gp',
3253 'smptett+xml': 'tt',
3254 'ttaf+xml': 'dfxp',
3255 'ttml+xml': 'ttml',
3256 'x-flv': 'flv',
3257 'x-mp4-fragmented': 'mp4',
3258 'x-ms-sami': 'sami',
3259 'x-ms-wmv': 'wmv',
3260 'mpegurl': 'm3u8',
3261 'x-mpegurl': 'm3u8',
3262 'vnd.apple.mpegurl': 'm3u8',
3263 'dash+xml': 'mpd',
3264 'f4m+xml': 'f4m',
3265 'hds+xml': 'f4m',
3266 'vnd.ms-sstr+xml': 'ism',
3267 'quicktime': 'mov',
3268 'mp2t': 'ts',
3269 'x-wav': 'wav',
3270 'filmstrip+json': 'fs',
3271 'svg+xml': 'svg',
3272 }
3273
3274 _, _, subtype = mt.rpartition('/')
3275 ext = SUBTYPE_MAP.get(subtype.lower())
3276 if ext is not None:
3277 return ext
3278
3279 SUFFIX_MAP = {
3280 'json': 'json',
3281 'xml': 'xml',
3282 'zip': 'zip',
3283 'gzip': 'gz',
3284 }
3285
3286 _, _, suffix = subtype.partition('+')
3287 ext = SUFFIX_MAP.get(suffix)
3288 if ext is not None:
3289 return ext
3290
3291 return subtype.replace('+', '.')
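# Illustrative usage:
#   mimetype2ext('audio/mpeg') == 'mp3'
#   mimetype2ext('application/dash+xml') == 'mpd'
#   mimetype2ext('video/mp4; codecs="avc1.64001f"') == 'mp4'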
3292
3293
3294 def ext2mimetype(ext_or_url):
3295 if not ext_or_url:
3296 return None
3297 if '.' not in ext_or_url:
3298 ext_or_url = f'file.{ext_or_url}'
3299 return mimetypes.guess_type(ext_or_url)[0]
3300
3301
3302 def parse_codecs(codecs_str):
3303 # http://tools.ietf.org/html/rfc6381
3304 if not codecs_str:
3305 return {}
3306 split_codecs = list(filter(None, map(
3307 str.strip, codecs_str.strip().strip(',').split(','))))
3308 vcodec, acodec, tcodec, hdr = None, None, None, None
3309 for full_codec in split_codecs:
3310 parts = full_codec.split('.')
3311 codec = parts[0].replace('0', '')
3312 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3313 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3314 if not vcodec:
3315 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3316 if codec in ('dvh1', 'dvhe'):
3317 hdr = 'DV'
3318 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3319 hdr = 'HDR10'
3320 elif full_codec.replace('0', '').startswith('vp9.2'):
3321 hdr = 'HDR10'
3322 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3323 if not acodec:
3324 acodec = full_codec
3325 elif codec in ('stpp', 'wvtt',):
3326 if not tcodec:
3327 tcodec = full_codec
3328 else:
3329 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3330 if vcodec or acodec or tcodec:
3331 return {
3332 'vcodec': vcodec or 'none',
3333 'acodec': acodec or 'none',
3334 'dynamic_range': hdr,
3335 **({'tcodec': tcodec} if tcodec is not None else {}),
3336 }
3337 elif len(split_codecs) == 2:
3338 return {
3339 'vcodec': split_codecs[0],
3340 'acodec': split_codecs[1],
3341 }
3342 return {}
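# Illustrative usage:
#   parse_codecs('avc1.64001f, mp4a.40.2') == {
#       'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}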
3343
3344
3345 def urlhandle_detect_ext(url_handle):
3346 getheader = url_handle.headers.get
3347
3348 cd = getheader('Content-Disposition')
3349 if cd:
3350 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3351 if m:
3352 e = determine_ext(m.group('filename'), default_ext=None)
3353 if e:
3354 return e
3355
3356 return mimetype2ext(getheader('Content-Type'))
3357
3358
3359 def encode_data_uri(data, mime_type):
3360 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3361
3362
3363 def age_restricted(content_limit, age_limit):
3364 """ Returns True iff the content should be blocked """
3365
3366 if age_limit is None: # No limit set
3367 return False
3368 if content_limit is None:
3369 return False # Content available for everyone
3370 return age_limit < content_limit
3371
3372
3373 def is_html(first_bytes):
3374 """ Detect whether a file contains HTML by examining its first bytes. """
3375
3376 BOMS = [
3377 (b'\xef\xbb\xbf', 'utf-8'),
3378 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3379 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3380 (b'\xff\xfe', 'utf-16-le'),
3381 (b'\xfe\xff', 'utf-16-be'),
3382 ]
3383 for bom, enc in BOMS:
3384 if first_bytes.startswith(bom):
3385 s = first_bytes[len(bom):].decode(enc, 'replace')
3386 break
3387 else:
3388 s = first_bytes.decode('utf-8', 'replace')
3389
3390 return re.match(r'^\s*<', s)
3391
3392
3393 def determine_protocol(info_dict):
3394 protocol = info_dict.get('protocol')
3395 if protocol is not None:
3396 return protocol
3397
3398 url = sanitize_url(info_dict['url'])
3399 if url.startswith('rtmp'):
3400 return 'rtmp'
3401 elif url.startswith('mms'):
3402 return 'mms'
3403 elif url.startswith('rtsp'):
3404 return 'rtsp'
3405
3406 ext = determine_ext(url)
3407 if ext == 'm3u8':
3408 return 'm3u8'
3409 elif ext == 'f4m':
3410 return 'f4m'
3411
3412 return compat_urllib_parse_urlparse(url).scheme
3413
3414
3415 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3416 """ Render a list of rows, each as a list of values.
3417 Text after a \t will be right aligned """
3418 def width(string):
3419 return len(remove_terminal_sequences(string).replace('\t', ''))
3420
3421 def get_max_lens(table):
3422 return [max(width(str(v)) for v in col) for col in zip(*table)]
3423
3424 def filter_using_list(row, filterArray):
3425 return [col for (take, col) in zip(filterArray, row) if take]
3426
3427 if hide_empty:
3428 max_lens = get_max_lens(data)
3429 header_row = filter_using_list(header_row, max_lens)
3430 data = [filter_using_list(row, max_lens) for row in data]
3431
3432 table = [header_row] + data
3433 max_lens = get_max_lens(table)
3434 extra_gap += 1
3435 if delim:
3436 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3437 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
3438 for row in table:
3439 for pos, text in enumerate(map(str, row)):
3440 if '\t' in text:
3441 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3442 else:
3443 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3444 ret = '\n'.join(''.join(row).rstrip() for row in table)
3445 return ret
3446
3447
3448 def _match_one(filter_part, dct, incomplete):
3449 # TODO: Generalize code with YoutubeDL._build_format_filter
3450 STRING_OPERATORS = {
3451 '*=': operator.contains,
3452 '^=': lambda attr, value: attr.startswith(value),
3453 '$=': lambda attr, value: attr.endswith(value),
3454 '~=': lambda attr, value: re.search(value, attr),
3455 }
3456 COMPARISON_OPERATORS = {
3457 **STRING_OPERATORS,
3458 '<=': operator.le, # "<=" must be defined above "<"
3459 '<': operator.lt,
3460 '>=': operator.ge,
3461 '>': operator.gt,
3462 '=': operator.eq,
3463 }
3464
3465 operator_rex = re.compile(r'''(?x)\s*
3466 (?P<key>[a-z_]+)
3467 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3468 (?:
3469 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3470 (?P<strval>.+?)
3471 )
3472 \s*$
3473 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3474 m = operator_rex.search(filter_part)
3475 if m:
3476 m = m.groupdict()
3477 unnegated_op = COMPARISON_OPERATORS[m['op']]
3478 if m['negation']:
3479 op = lambda attr, value: not unnegated_op(attr, value)
3480 else:
3481 op = unnegated_op
3482 comparison_value = m['quotedstrval'] or m['strval']  # the regex has no 'intval' group; one of these always matches
3483 if m['quote']:
3484 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3485 actual_value = dct.get(m['key'])
3486 numeric_comparison = None
3487 if isinstance(actual_value, compat_numeric_types):
3488 # If the original field is a string and the matching comparison value is
3489 # a number, we should respect the origin of the original field
3490 # and process the comparison value as a string (see
3491 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3492 try:
3493 numeric_comparison = int(comparison_value)
3494 except ValueError:
3495 numeric_comparison = parse_filesize(comparison_value)
3496 if numeric_comparison is None:
3497 numeric_comparison = parse_filesize(f'{comparison_value}B')
3498 if numeric_comparison is None:
3499 numeric_comparison = parse_duration(comparison_value)
3500 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3501 raise ValueError('Operator %s only supports string values!' % m['op'])
3502 if actual_value is None:
3503 return incomplete or m['none_inclusive']
3504 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3505
3506 UNARY_OPERATORS = {
3507 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3508 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3509 }
3510 operator_rex = re.compile(r'''(?x)\s*
3511 (?P<op>%s)\s*(?P<key>[a-z_]+)
3512 \s*$
3513 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3514 m = operator_rex.search(filter_part)
3515 if m:
3516 op = UNARY_OPERATORS[m.group('op')]
3517 actual_value = dct.get(m.group('key'))
3518 if incomplete and actual_value is None:
3519 return True
3520 return op(actual_value)
3521
3522 raise ValueError('Invalid filter part %r' % filter_part)
3523
3524
3525 def match_str(filter_str, dct, incomplete=False):
3526 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3527 When incomplete, all conditions passes on missing fields
3528 """
3529 return all(
3530 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3531 for filter_part in re.split(r'(?<!\\)&', filter_str))
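# Illustrative usage (the syntax mirrors the --match-filter option):
#   match_str('like_count > 100 & dislike_count <? 50',
#             {'like_count': 190, 'dislike_count': 10})  # True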
3532
3533
3534 def match_filter_func(filter_str):
3535 def _match_func(info_dict, *args, **kwargs):
3536 if match_str(filter_str, info_dict, *args, **kwargs):
3537 return None
3538 else:
3539 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3540 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3541 return _match_func
3542
3543
3544 def parse_dfxp_time_expr(time_expr):
3545 if not time_expr:
3546 return
3547
3548 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3549 if mobj:
3550 return float(mobj.group('time_offset'))
3551
3552 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3553 if mobj:
3554 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3555
3556
3557 def srt_subtitles_timecode(seconds):
3558 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3559
3560
3561 def ass_subtitles_timecode(seconds):
3562 time = timetuple_from_msec(seconds * 1000)
3563 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
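# Illustrative usage (relies on timetuple_from_msec defined elsewhere in this file):
#   srt_subtitles_timecode(3661.5) == '01:01:01,500'
#   ass_subtitles_timecode(3661.5) == '1:01:01.50'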
3564
3565
3566 def dfxp2srt(dfxp_data):
3567 '''
3568 @param dfxp_data A bytes-like object containing DFXP data
3569 @returns A unicode object containing converted SRT data
3570 '''
3571 LEGACY_NAMESPACES = (
3572 (b'http://www.w3.org/ns/ttml', [
3573 b'http://www.w3.org/2004/11/ttaf1',
3574 b'http://www.w3.org/2006/04/ttaf1',
3575 b'http://www.w3.org/2006/10/ttaf1',
3576 ]),
3577 (b'http://www.w3.org/ns/ttml#styling', [
3578 b'http://www.w3.org/ns/ttml#style',
3579 ]),
3580 )
3581
3582 SUPPORTED_STYLING = [
3583 'color',
3584 'fontFamily',
3585 'fontSize',
3586 'fontStyle',
3587 'fontWeight',
3588 'textDecoration'
3589 ]
3590
3591 _x = functools.partial(xpath_with_ns, ns_map={
3592 'xml': 'http://www.w3.org/XML/1998/namespace',
3593 'ttml': 'http://www.w3.org/ns/ttml',
3594 'tts': 'http://www.w3.org/ns/ttml#styling',
3595 })
3596
3597 styles = {}
3598 default_style = {}
3599
3600 class TTMLPElementParser(object):
3601 _out = ''
3602 _unclosed_elements = []
3603 _applied_styles = []
3604
3605 def start(self, tag, attrib):
3606 if tag in (_x('ttml:br'), 'br'):
3607 self._out += '\n'
3608 else:
3609 unclosed_elements = []
3610 style = {}
3611 element_style_id = attrib.get('style')
3612 if default_style:
3613 style.update(default_style)
3614 if element_style_id:
3615 style.update(styles.get(element_style_id, {}))
3616 for prop in SUPPORTED_STYLING:
3617 prop_val = attrib.get(_x('tts:' + prop))
3618 if prop_val:
3619 style[prop] = prop_val
3620 if style:
3621 font = ''
3622 for k, v in sorted(style.items()):
3623 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3624 continue
3625 if k == 'color':
3626 font += ' color="%s"' % v
3627 elif k == 'fontSize':
3628 font += ' size="%s"' % v
3629 elif k == 'fontFamily':
3630 font += ' face="%s"' % v
3631 elif k == 'fontWeight' and v == 'bold':
3632 self._out += '<b>'
3633 unclosed_elements.append('b')
3634 elif k == 'fontStyle' and v == 'italic':
3635 self._out += '<i>'
3636 unclosed_elements.append('i')
3637 elif k == 'textDecoration' and v == 'underline':
3638 self._out += '<u>'
3639 unclosed_elements.append('u')
3640 if font:
3641 self._out += '<font' + font + '>'
3642 unclosed_elements.append('font')
3643 applied_style = {}
3644 if self._applied_styles:
3645 applied_style.update(self._applied_styles[-1])
3646 applied_style.update(style)
3647 self._applied_styles.append(applied_style)
3648 self._unclosed_elements.append(unclosed_elements)
3649
3650 def end(self, tag):
3651 if tag not in (_x('ttml:br'), 'br'):
3652 unclosed_elements = self._unclosed_elements.pop()
3653 for element in reversed(unclosed_elements):
3654 self._out += '</%s>' % element
3655 if unclosed_elements and self._applied_styles:
3656 self._applied_styles.pop()
3657
3658 def data(self, data):
3659 self._out += data
3660
3661 def close(self):
3662 return self._out.strip()
3663
3664 def parse_node(node):
3665 target = TTMLPElementParser()
3666 parser = xml.etree.ElementTree.XMLParser(target=target)
3667 parser.feed(xml.etree.ElementTree.tostring(node))
3668 return parser.close()
3669
3670 for k, v in LEGACY_NAMESPACES:
3671 for ns in v:
3672 dfxp_data = dfxp_data.replace(ns, k)
3673
3674 dfxp = compat_etree_fromstring(dfxp_data)
3675 out = []
3676 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3677
3678 if not paras:
3679 raise ValueError('Invalid dfxp/TTML subtitle')
3680
3681 repeat = False
3682 while True:
3683 for style in dfxp.findall(_x('.//ttml:style')):
3684 style_id = style.get('id') or style.get(_x('xml:id'))
3685 if not style_id:
3686 continue
3687 parent_style_id = style.get('style')
3688 if parent_style_id:
3689 if parent_style_id not in styles:
3690 repeat = True
3691 continue
3692 styles[style_id] = styles[parent_style_id].copy()
3693 for prop in SUPPORTED_STYLING:
3694 prop_val = style.get(_x('tts:' + prop))
3695 if prop_val:
3696 styles.setdefault(style_id, {})[prop] = prop_val
3697 if repeat:
3698 repeat = False
3699 else:
3700 break
3701
3702 for p in ('body', 'div'):
3703 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3704 if ele is None:
3705 continue
3706 style = styles.get(ele.get('style'))
3707 if not style:
3708 continue
3709 default_style.update(style)
3710
3711 for para, index in zip(paras, itertools.count(1)):
3712 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3713 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3714 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3715 if begin_time is None:
3716 continue
3717 if not end_time:
3718 if not dur:
3719 continue
3720 end_time = begin_time + dur
3721 out.append('%d\n%s --> %s\n%s\n\n' % (
3722 index,
3723 srt_subtitles_timecode(begin_time),
3724 srt_subtitles_timecode(end_time),
3725 parse_node(para)))
3726
3727 return ''.join(out)
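# Minimal illustrative conversion (assumed input document):
#   dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#            b'<p begin="0s" end="1s">Hi</p></div></body></tt>')
#   # '1\n00:00:00,000 --> 00:00:01,000\nHi\n\n'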
3728
3729
3730 def cli_option(params, command_option, param):
3731 param = params.get(param)
3732 if param:
3733 param = compat_str(param)
3734 return [command_option, param] if param is not None else []
3735
3736
3737 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3738 param = params.get(param)
3739 if param is None:
3740 return []
3741 assert isinstance(param, bool)
3742 if separator:
3743 return [command_option + separator + (true_value if param else false_value)]
3744 return [command_option, true_value if param else false_value]
3745
3746
3747 def cli_valueless_option(params, command_option, param, expected_value=True):
3748 param = params.get(param)
3749 return [command_option] if param == expected_value else []
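# Illustrative expansions (hypothetical params dicts):
#   cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#   # ['--proxy', 'socks5://127.0.0.1']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   # ['--no-check-certificate', 'true']
#   cli_valueless_option({'nopart': True}, '--no-part', 'nopart')
#   # ['--no-part']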
3750
3751
3752 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3753 if isinstance(argdict, (list, tuple)): # for backward compatibility
3754 if use_compat:
3755 return argdict
3756 else:
3757 argdict = None
3758 if argdict is None:
3759 return default
3760 assert isinstance(argdict, dict)
3761
3762 assert isinstance(keys, (list, tuple))
3763 for key_list in keys:
3764 arg_list = list(filter(
3765 lambda x: x is not None,
3766 [argdict.get(key.lower()) for key in variadic(key_list)]))
3767 if arg_list:
3768 return [arg for args in arg_list for arg in args]
3769 return default
3770
3771
3772 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3773 main_key, exe = main_key.lower(), exe.lower()
3774 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3775 keys = [f'{root_key}{k}' for k in (keys or [''])]
3776 if root_key in keys:
3777 if main_key != exe:
3778 keys.append((main_key, exe))
3779 keys.append('default')
3780 else:
3781 use_compat = False
3782 return cli_configuration_args(argdict, keys, default, use_compat)
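# Illustrative resolution order (hypothetical argdict): for main_key='MyPP'
# and exe='ffmpeg', the key 'mypp+ffmpeg' is tried first, then 'mypp' and
# 'ffmpeg' combined, then 'default':
#   _configuration_args('MyPP', {'mypp+ffmpeg': ['-v']}, 'ffmpeg')  # ['-v']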
3783
3784
3785 class ISO639Utils(object):
3786 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3787 _lang_map = {
3788 'aa': 'aar',
3789 'ab': 'abk',
3790 'ae': 'ave',
3791 'af': 'afr',
3792 'ak': 'aka',
3793 'am': 'amh',
3794 'an': 'arg',
3795 'ar': 'ara',
3796 'as': 'asm',
3797 'av': 'ava',
3798 'ay': 'aym',
3799 'az': 'aze',
3800 'ba': 'bak',
3801 'be': 'bel',
3802 'bg': 'bul',
3803 'bh': 'bih',
3804 'bi': 'bis',
3805 'bm': 'bam',
3806 'bn': 'ben',
3807 'bo': 'bod',
3808 'br': 'bre',
3809 'bs': 'bos',
3810 'ca': 'cat',
3811 'ce': 'che',
3812 'ch': 'cha',
3813 'co': 'cos',
3814 'cr': 'cre',
3815 'cs': 'ces',
3816 'cu': 'chu',
3817 'cv': 'chv',
3818 'cy': 'cym',
3819 'da': 'dan',
3820 'de': 'deu',
3821 'dv': 'div',
3822 'dz': 'dzo',
3823 'ee': 'ewe',
3824 'el': 'ell',
3825 'en': 'eng',
3826 'eo': 'epo',
3827 'es': 'spa',
3828 'et': 'est',
3829 'eu': 'eus',
3830 'fa': 'fas',
3831 'ff': 'ful',
3832 'fi': 'fin',
3833 'fj': 'fij',
3834 'fo': 'fao',
3835 'fr': 'fra',
3836 'fy': 'fry',
3837 'ga': 'gle',
3838 'gd': 'gla',
3839 'gl': 'glg',
3840 'gn': 'grn',
3841 'gu': 'guj',
3842 'gv': 'glv',
3843 'ha': 'hau',
3844 'he': 'heb',
3845 'iw': 'heb', # Replaced by he in 1989 revision
3846 'hi': 'hin',
3847 'ho': 'hmo',
3848 'hr': 'hrv',
3849 'ht': 'hat',
3850 'hu': 'hun',
3851 'hy': 'hye',
3852 'hz': 'her',
3853 'ia': 'ina',
3854 'id': 'ind',
3855 'in': 'ind', # Replaced by id in 1989 revision
3856 'ie': 'ile',
3857 'ig': 'ibo',
3858 'ii': 'iii',
3859 'ik': 'ipk',
3860 'io': 'ido',
3861 'is': 'isl',
3862 'it': 'ita',
3863 'iu': 'iku',
3864 'ja': 'jpn',
3865 'jv': 'jav',
3866 'ka': 'kat',
3867 'kg': 'kon',
3868 'ki': 'kik',
3869 'kj': 'kua',
3870 'kk': 'kaz',
3871 'kl': 'kal',
3872 'km': 'khm',
3873 'kn': 'kan',
3874 'ko': 'kor',
3875 'kr': 'kau',
3876 'ks': 'kas',
3877 'ku': 'kur',
3878 'kv': 'kom',
3879 'kw': 'cor',
3880 'ky': 'kir',
3881 'la': 'lat',
3882 'lb': 'ltz',
3883 'lg': 'lug',
3884 'li': 'lim',
3885 'ln': 'lin',
3886 'lo': 'lao',
3887 'lt': 'lit',
3888 'lu': 'lub',
3889 'lv': 'lav',
3890 'mg': 'mlg',
3891 'mh': 'mah',
3892 'mi': 'mri',
3893 'mk': 'mkd',
3894 'ml': 'mal',
3895 'mn': 'mon',
3896 'mr': 'mar',
3897 'ms': 'msa',
3898 'mt': 'mlt',
3899 'my': 'mya',
3900 'na': 'nau',
3901 'nb': 'nob',
3902 'nd': 'nde',
3903 'ne': 'nep',
3904 'ng': 'ndo',
3905 'nl': 'nld',
3906 'nn': 'nno',
3907 'no': 'nor',
3908 'nr': 'nbl',
3909 'nv': 'nav',
3910 'ny': 'nya',
3911 'oc': 'oci',
3912 'oj': 'oji',
3913 'om': 'orm',
3914 'or': 'ori',
3915 'os': 'oss',
3916 'pa': 'pan',
3917 'pi': 'pli',
3918 'pl': 'pol',
3919 'ps': 'pus',
3920 'pt': 'por',
3921 'qu': 'que',
3922 'rm': 'roh',
3923 'rn': 'run',
3924 'ro': 'ron',
3925 'ru': 'rus',
3926 'rw': 'kin',
3927 'sa': 'san',
3928 'sc': 'srd',
3929 'sd': 'snd',
3930 'se': 'sme',
3931 'sg': 'sag',
3932 'si': 'sin',
3933 'sk': 'slk',
3934 'sl': 'slv',
3935 'sm': 'smo',
3936 'sn': 'sna',
3937 'so': 'som',
3938 'sq': 'sqi',
3939 'sr': 'srp',
3940 'ss': 'ssw',
3941 'st': 'sot',
3942 'su': 'sun',
3943 'sv': 'swe',
3944 'sw': 'swa',
3945 'ta': 'tam',
3946 'te': 'tel',
3947 'tg': 'tgk',
3948 'th': 'tha',
3949 'ti': 'tir',
3950 'tk': 'tuk',
3951 'tl': 'tgl',
3952 'tn': 'tsn',
3953 'to': 'ton',
3954 'tr': 'tur',
3955 'ts': 'tso',
3956 'tt': 'tat',
3957 'tw': 'twi',
3958 'ty': 'tah',
3959 'ug': 'uig',
3960 'uk': 'ukr',
3961 'ur': 'urd',
3962 'uz': 'uzb',
3963 've': 'ven',
3964 'vi': 'vie',
3965 'vo': 'vol',
3966 'wa': 'wln',
3967 'wo': 'wol',
3968 'xh': 'xho',
3969 'yi': 'yid',
3970 'ji': 'yid', # Replaced by yi in 1989 revision
3971 'yo': 'yor',
3972 'za': 'zha',
3973 'zh': 'zho',
3974 'zu': 'zul',
3975 }
3976
3977 @classmethod
3978 def short2long(cls, code):
3979 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3980 return cls._lang_map.get(code[:2])
3981
3982 @classmethod
3983 def long2short(cls, code):
3984 """Convert language code from ISO 639-2/T to ISO 639-1"""
3985 for short_name, long_name in cls._lang_map.items():
3986 if long_name == code:
3987 return short_name
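# Illustrative conversions (note that short2long() only looks at the first
# two characters, so e.g. 'en-US' also maps to 'eng'):
#   ISO639Utils.short2long('en')   # 'eng'
#   ISO639Utils.long2short('fra')  # 'fr'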
3988
3989
3990 class ISO3166Utils(object):
3991 # From http://data.okfn.org/data/core/country-list
3992 _country_map = {
3993 'AF': 'Afghanistan',
3994 'AX': 'Åland Islands',
3995 'AL': 'Albania',
3996 'DZ': 'Algeria',
3997 'AS': 'American Samoa',
3998 'AD': 'Andorra',
3999 'AO': 'Angola',
4000 'AI': 'Anguilla',
4001 'AQ': 'Antarctica',
4002 'AG': 'Antigua and Barbuda',
4003 'AR': 'Argentina',
4004 'AM': 'Armenia',
4005 'AW': 'Aruba',
4006 'AU': 'Australia',
4007 'AT': 'Austria',
4008 'AZ': 'Azerbaijan',
4009 'BS': 'Bahamas',
4010 'BH': 'Bahrain',
4011 'BD': 'Bangladesh',
4012 'BB': 'Barbados',
4013 'BY': 'Belarus',
4014 'BE': 'Belgium',
4015 'BZ': 'Belize',
4016 'BJ': 'Benin',
4017 'BM': 'Bermuda',
4018 'BT': 'Bhutan',
4019 'BO': 'Bolivia, Plurinational State of',
4020 'BQ': 'Bonaire, Sint Eustatius and Saba',
4021 'BA': 'Bosnia and Herzegovina',
4022 'BW': 'Botswana',
4023 'BV': 'Bouvet Island',
4024 'BR': 'Brazil',
4025 'IO': 'British Indian Ocean Territory',
4026 'BN': 'Brunei Darussalam',
4027 'BG': 'Bulgaria',
4028 'BF': 'Burkina Faso',
4029 'BI': 'Burundi',
4030 'KH': 'Cambodia',
4031 'CM': 'Cameroon',
4032 'CA': 'Canada',
4033 'CV': 'Cape Verde',
4034 'KY': 'Cayman Islands',
4035 'CF': 'Central African Republic',
4036 'TD': 'Chad',
4037 'CL': 'Chile',
4038 'CN': 'China',
4039 'CX': 'Christmas Island',
4040 'CC': 'Cocos (Keeling) Islands',
4041 'CO': 'Colombia',
4042 'KM': 'Comoros',
4043 'CG': 'Congo',
4044 'CD': 'Congo, the Democratic Republic of the',
4045 'CK': 'Cook Islands',
4046 'CR': 'Costa Rica',
4047 'CI': 'Côte d\'Ivoire',
4048 'HR': 'Croatia',
4049 'CU': 'Cuba',
4050 'CW': 'Curaçao',
4051 'CY': 'Cyprus',
4052 'CZ': 'Czech Republic',
4053 'DK': 'Denmark',
4054 'DJ': 'Djibouti',
4055 'DM': 'Dominica',
4056 'DO': 'Dominican Republic',
4057 'EC': 'Ecuador',
4058 'EG': 'Egypt',
4059 'SV': 'El Salvador',
4060 'GQ': 'Equatorial Guinea',
4061 'ER': 'Eritrea',
4062 'EE': 'Estonia',
4063 'ET': 'Ethiopia',
4064 'FK': 'Falkland Islands (Malvinas)',
4065 'FO': 'Faroe Islands',
4066 'FJ': 'Fiji',
4067 'FI': 'Finland',
4068 'FR': 'France',
4069 'GF': 'French Guiana',
4070 'PF': 'French Polynesia',
4071 'TF': 'French Southern Territories',
4072 'GA': 'Gabon',
4073 'GM': 'Gambia',
4074 'GE': 'Georgia',
4075 'DE': 'Germany',
4076 'GH': 'Ghana',
4077 'GI': 'Gibraltar',
4078 'GR': 'Greece',
4079 'GL': 'Greenland',
4080 'GD': 'Grenada',
4081 'GP': 'Guadeloupe',
4082 'GU': 'Guam',
4083 'GT': 'Guatemala',
4084 'GG': 'Guernsey',
4085 'GN': 'Guinea',
4086 'GW': 'Guinea-Bissau',
4087 'GY': 'Guyana',
4088 'HT': 'Haiti',
4089 'HM': 'Heard Island and McDonald Islands',
4090 'VA': 'Holy See (Vatican City State)',
4091 'HN': 'Honduras',
4092 'HK': 'Hong Kong',
4093 'HU': 'Hungary',
4094 'IS': 'Iceland',
4095 'IN': 'India',
4096 'ID': 'Indonesia',
4097 'IR': 'Iran, Islamic Republic of',
4098 'IQ': 'Iraq',
4099 'IE': 'Ireland',
4100 'IM': 'Isle of Man',
4101 'IL': 'Israel',
4102 'IT': 'Italy',
4103 'JM': 'Jamaica',
4104 'JP': 'Japan',
4105 'JE': 'Jersey',
4106 'JO': 'Jordan',
4107 'KZ': 'Kazakhstan',
4108 'KE': 'Kenya',
4109 'KI': 'Kiribati',
4110 'KP': 'Korea, Democratic People\'s Republic of',
4111 'KR': 'Korea, Republic of',
4112 'KW': 'Kuwait',
4113 'KG': 'Kyrgyzstan',
4114 'LA': 'Lao People\'s Democratic Republic',
4115 'LV': 'Latvia',
4116 'LB': 'Lebanon',
4117 'LS': 'Lesotho',
4118 'LR': 'Liberia',
4119 'LY': 'Libya',
4120 'LI': 'Liechtenstein',
4121 'LT': 'Lithuania',
4122 'LU': 'Luxembourg',
4123 'MO': 'Macao',
4124 'MK': 'Macedonia, the Former Yugoslav Republic of',
4125 'MG': 'Madagascar',
4126 'MW': 'Malawi',
4127 'MY': 'Malaysia',
4128 'MV': 'Maldives',
4129 'ML': 'Mali',
4130 'MT': 'Malta',
4131 'MH': 'Marshall Islands',
4132 'MQ': 'Martinique',
4133 'MR': 'Mauritania',
4134 'MU': 'Mauritius',
4135 'YT': 'Mayotte',
4136 'MX': 'Mexico',
4137 'FM': 'Micronesia, Federated States of',
4138 'MD': 'Moldova, Republic of',
4139 'MC': 'Monaco',
4140 'MN': 'Mongolia',
4141 'ME': 'Montenegro',
4142 'MS': 'Montserrat',
4143 'MA': 'Morocco',
4144 'MZ': 'Mozambique',
4145 'MM': 'Myanmar',
4146 'NA': 'Namibia',
4147 'NR': 'Nauru',
4148 'NP': 'Nepal',
4149 'NL': 'Netherlands',
4150 'NC': 'New Caledonia',
4151 'NZ': 'New Zealand',
4152 'NI': 'Nicaragua',
4153 'NE': 'Niger',
4154 'NG': 'Nigeria',
4155 'NU': 'Niue',
4156 'NF': 'Norfolk Island',
4157 'MP': 'Northern Mariana Islands',
4158 'NO': 'Norway',
4159 'OM': 'Oman',
4160 'PK': 'Pakistan',
4161 'PW': 'Palau',
4162 'PS': 'Palestine, State of',
4163 'PA': 'Panama',
4164 'PG': 'Papua New Guinea',
4165 'PY': 'Paraguay',
4166 'PE': 'Peru',
4167 'PH': 'Philippines',
4168 'PN': 'Pitcairn',
4169 'PL': 'Poland',
4170 'PT': 'Portugal',
4171 'PR': 'Puerto Rico',
4172 'QA': 'Qatar',
4173 'RE': 'Réunion',
4174 'RO': 'Romania',
4175 'RU': 'Russian Federation',
4176 'RW': 'Rwanda',
4177 'BL': 'Saint Barthélemy',
4178 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4179 'KN': 'Saint Kitts and Nevis',
4180 'LC': 'Saint Lucia',
4181 'MF': 'Saint Martin (French part)',
4182 'PM': 'Saint Pierre and Miquelon',
4183 'VC': 'Saint Vincent and the Grenadines',
4184 'WS': 'Samoa',
4185 'SM': 'San Marino',
4186 'ST': 'Sao Tome and Principe',
4187 'SA': 'Saudi Arabia',
4188 'SN': 'Senegal',
4189 'RS': 'Serbia',
4190 'SC': 'Seychelles',
4191 'SL': 'Sierra Leone',
4192 'SG': 'Singapore',
4193 'SX': 'Sint Maarten (Dutch part)',
4194 'SK': 'Slovakia',
4195 'SI': 'Slovenia',
4196 'SB': 'Solomon Islands',
4197 'SO': 'Somalia',
4198 'ZA': 'South Africa',
4199 'GS': 'South Georgia and the South Sandwich Islands',
4200 'SS': 'South Sudan',
4201 'ES': 'Spain',
4202 'LK': 'Sri Lanka',
4203 'SD': 'Sudan',
4204 'SR': 'Suriname',
4205 'SJ': 'Svalbard and Jan Mayen',
4206 'SZ': 'Swaziland',
4207 'SE': 'Sweden',
4208 'CH': 'Switzerland',
4209 'SY': 'Syrian Arab Republic',
4210 'TW': 'Taiwan, Province of China',
4211 'TJ': 'Tajikistan',
4212 'TZ': 'Tanzania, United Republic of',
4213 'TH': 'Thailand',
4214 'TL': 'Timor-Leste',
4215 'TG': 'Togo',
4216 'TK': 'Tokelau',
4217 'TO': 'Tonga',
4218 'TT': 'Trinidad and Tobago',
4219 'TN': 'Tunisia',
4220 'TR': 'Turkey',
4221 'TM': 'Turkmenistan',
4222 'TC': 'Turks and Caicos Islands',
4223 'TV': 'Tuvalu',
4224 'UG': 'Uganda',
4225 'UA': 'Ukraine',
4226 'AE': 'United Arab Emirates',
4227 'GB': 'United Kingdom',
4228 'US': 'United States',
4229 'UM': 'United States Minor Outlying Islands',
4230 'UY': 'Uruguay',
4231 'UZ': 'Uzbekistan',
4232 'VU': 'Vanuatu',
4233 'VE': 'Venezuela, Bolivarian Republic of',
4234 'VN': 'Viet Nam',
4235 'VG': 'Virgin Islands, British',
4236 'VI': 'Virgin Islands, U.S.',
4237 'WF': 'Wallis and Futuna',
4238 'EH': 'Western Sahara',
4239 'YE': 'Yemen',
4240 'ZM': 'Zambia',
4241 'ZW': 'Zimbabwe',
4242 }
4243
4244 @classmethod
4245 def short2full(cls, code):
4246 """Convert an ISO 3166-2 country code to the corresponding full name"""
4247 return cls._country_map.get(code.upper())
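# Illustrative lookup (input is case-insensitive):
#   ISO3166Utils.short2full('de')  # 'Germany'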
4248
4249
4250 class GeoUtils(object):
4251 # Major IPv4 address blocks per country
4252 _country_ip_map = {
4253 'AD': '46.172.224.0/19',
4254 'AE': '94.200.0.0/13',
4255 'AF': '149.54.0.0/17',
4256 'AG': '209.59.64.0/18',
4257 'AI': '204.14.248.0/21',
4258 'AL': '46.99.0.0/16',
4259 'AM': '46.70.0.0/15',
4260 'AO': '105.168.0.0/13',
4261 'AP': '182.50.184.0/21',
4262 'AQ': '23.154.160.0/24',
4263 'AR': '181.0.0.0/12',
4264 'AS': '202.70.112.0/20',
4265 'AT': '77.116.0.0/14',
4266 'AU': '1.128.0.0/11',
4267 'AW': '181.41.0.0/18',
4268 'AX': '185.217.4.0/22',
4269 'AZ': '5.197.0.0/16',
4270 'BA': '31.176.128.0/17',
4271 'BB': '65.48.128.0/17',
4272 'BD': '114.130.0.0/16',
4273 'BE': '57.0.0.0/8',
4274 'BF': '102.178.0.0/15',
4275 'BG': '95.42.0.0/15',
4276 'BH': '37.131.0.0/17',
4277 'BI': '154.117.192.0/18',
4278 'BJ': '137.255.0.0/16',
4279 'BL': '185.212.72.0/23',
4280 'BM': '196.12.64.0/18',
4281 'BN': '156.31.0.0/16',
4282 'BO': '161.56.0.0/16',
4283 'BQ': '161.0.80.0/20',
4284 'BR': '191.128.0.0/12',
4285 'BS': '24.51.64.0/18',
4286 'BT': '119.2.96.0/19',
4287 'BW': '168.167.0.0/16',
4288 'BY': '178.120.0.0/13',
4289 'BZ': '179.42.192.0/18',
4290 'CA': '99.224.0.0/11',
4291 'CD': '41.243.0.0/16',
4292 'CF': '197.242.176.0/21',
4293 'CG': '160.113.0.0/16',
4294 'CH': '85.0.0.0/13',
4295 'CI': '102.136.0.0/14',
4296 'CK': '202.65.32.0/19',
4297 'CL': '152.172.0.0/14',
4298 'CM': '102.244.0.0/14',
4299 'CN': '36.128.0.0/10',
4300 'CO': '181.240.0.0/12',
4301 'CR': '201.192.0.0/12',
4302 'CU': '152.206.0.0/15',
4303 'CV': '165.90.96.0/19',
4304 'CW': '190.88.128.0/17',
4305 'CY': '31.153.0.0/16',
4306 'CZ': '88.100.0.0/14',
4307 'DE': '53.0.0.0/8',
4308 'DJ': '197.241.0.0/17',
4309 'DK': '87.48.0.0/12',
4310 'DM': '192.243.48.0/20',
4311 'DO': '152.166.0.0/15',
4312 'DZ': '41.96.0.0/12',
4313 'EC': '186.68.0.0/15',
4314 'EE': '90.190.0.0/15',
4315 'EG': '156.160.0.0/11',
4316 'ER': '196.200.96.0/20',
4317 'ES': '88.0.0.0/11',
4318 'ET': '196.188.0.0/14',
4319 'EU': '2.16.0.0/13',
4320 'FI': '91.152.0.0/13',
4321 'FJ': '144.120.0.0/16',
4322 'FK': '80.73.208.0/21',
4323 'FM': '119.252.112.0/20',
4324 'FO': '88.85.32.0/19',
4325 'FR': '90.0.0.0/9',
4326 'GA': '41.158.0.0/15',
4327 'GB': '25.0.0.0/8',
4328 'GD': '74.122.88.0/21',
4329 'GE': '31.146.0.0/16',
4330 'GF': '161.22.64.0/18',
4331 'GG': '62.68.160.0/19',
4332 'GH': '154.160.0.0/12',
4333 'GI': '95.164.0.0/16',
4334 'GL': '88.83.0.0/19',
4335 'GM': '160.182.0.0/15',
4336 'GN': '197.149.192.0/18',
4337 'GP': '104.250.0.0/19',
4338 'GQ': '105.235.224.0/20',
4339 'GR': '94.64.0.0/13',
4340 'GT': '168.234.0.0/16',
4341 'GU': '168.123.0.0/16',
4342 'GW': '197.214.80.0/20',
4343 'GY': '181.41.64.0/18',
4344 'HK': '113.252.0.0/14',
4345 'HN': '181.210.0.0/16',
4346 'HR': '93.136.0.0/13',
4347 'HT': '148.102.128.0/17',
4348 'HU': '84.0.0.0/14',
4349 'ID': '39.192.0.0/10',
4350 'IE': '87.32.0.0/12',
4351 'IL': '79.176.0.0/13',
4352 'IM': '5.62.80.0/20',
4353 'IN': '117.192.0.0/10',
4354 'IO': '203.83.48.0/21',
4355 'IQ': '37.236.0.0/14',
4356 'IR': '2.176.0.0/12',
4357 'IS': '82.221.0.0/16',
4358 'IT': '79.0.0.0/10',
4359 'JE': '87.244.64.0/18',
4360 'JM': '72.27.0.0/17',
4361 'JO': '176.29.0.0/16',
4362 'JP': '133.0.0.0/8',
4363 'KE': '105.48.0.0/12',
4364 'KG': '158.181.128.0/17',
4365 'KH': '36.37.128.0/17',
4366 'KI': '103.25.140.0/22',
4367 'KM': '197.255.224.0/20',
4368 'KN': '198.167.192.0/19',
4369 'KP': '175.45.176.0/22',
4370 'KR': '175.192.0.0/10',
4371 'KW': '37.36.0.0/14',
4372 'KY': '64.96.0.0/15',
4373 'KZ': '2.72.0.0/13',
4374 'LA': '115.84.64.0/18',
4375 'LB': '178.135.0.0/16',
4376 'LC': '24.92.144.0/20',
4377 'LI': '82.117.0.0/19',
4378 'LK': '112.134.0.0/15',
4379 'LR': '102.183.0.0/16',
4380 'LS': '129.232.0.0/17',
4381 'LT': '78.56.0.0/13',
4382 'LU': '188.42.0.0/16',
4383 'LV': '46.109.0.0/16',
4384 'LY': '41.252.0.0/14',
4385 'MA': '105.128.0.0/11',
4386 'MC': '88.209.64.0/18',
4387 'MD': '37.246.0.0/16',
4388 'ME': '178.175.0.0/17',
4389 'MF': '74.112.232.0/21',
4390 'MG': '154.126.0.0/17',
4391 'MH': '117.103.88.0/21',
4392 'MK': '77.28.0.0/15',
4393 'ML': '154.118.128.0/18',
4394 'MM': '37.111.0.0/17',
4395 'MN': '49.0.128.0/17',
4396 'MO': '60.246.0.0/16',
4397 'MP': '202.88.64.0/20',
4398 'MQ': '109.203.224.0/19',
4399 'MR': '41.188.64.0/18',
4400 'MS': '208.90.112.0/22',
4401 'MT': '46.11.0.0/16',
4402 'MU': '105.16.0.0/12',
4403 'MV': '27.114.128.0/18',
4404 'MW': '102.70.0.0/15',
4405 'MX': '187.192.0.0/11',
4406 'MY': '175.136.0.0/13',
4407 'MZ': '197.218.0.0/15',
4408 'NA': '41.182.0.0/16',
4409 'NC': '101.101.0.0/18',
4410 'NE': '197.214.0.0/18',
4411 'NF': '203.17.240.0/22',
4412 'NG': '105.112.0.0/12',
4413 'NI': '186.76.0.0/15',
4414 'NL': '145.96.0.0/11',
4415 'NO': '84.208.0.0/13',
4416 'NP': '36.252.0.0/15',
4417 'NR': '203.98.224.0/19',
4418 'NU': '49.156.48.0/22',
4419 'NZ': '49.224.0.0/14',
4420 'OM': '5.36.0.0/15',
4421 'PA': '186.72.0.0/15',
4422 'PE': '186.160.0.0/14',
4423 'PF': '123.50.64.0/18',
4424 'PG': '124.240.192.0/19',
4425 'PH': '49.144.0.0/13',
4426 'PK': '39.32.0.0/11',
4427 'PL': '83.0.0.0/11',
4428 'PM': '70.36.0.0/20',
4429 'PR': '66.50.0.0/16',
4430 'PS': '188.161.0.0/16',
4431 'PT': '85.240.0.0/13',
4432 'PW': '202.124.224.0/20',
4433 'PY': '181.120.0.0/14',
4434 'QA': '37.210.0.0/15',
4435 'RE': '102.35.0.0/16',
4436 'RO': '79.112.0.0/13',
4437 'RS': '93.86.0.0/15',
4438 'RU': '5.136.0.0/13',
4439 'RW': '41.186.0.0/16',
4440 'SA': '188.48.0.0/13',
4441 'SB': '202.1.160.0/19',
4442 'SC': '154.192.0.0/11',
4443 'SD': '102.120.0.0/13',
4444 'SE': '78.64.0.0/12',
4445 'SG': '8.128.0.0/10',
4446 'SI': '188.196.0.0/14',
4447 'SK': '78.98.0.0/15',
4448 'SL': '102.143.0.0/17',
4449 'SM': '89.186.32.0/19',
4450 'SN': '41.82.0.0/15',
4451 'SO': '154.115.192.0/18',
4452 'SR': '186.179.128.0/17',
4453 'SS': '105.235.208.0/21',
4454 'ST': '197.159.160.0/19',
4455 'SV': '168.243.0.0/16',
4456 'SX': '190.102.0.0/20',
4457 'SY': '5.0.0.0/16',
4458 'SZ': '41.84.224.0/19',
4459 'TC': '65.255.48.0/20',
4460 'TD': '154.68.128.0/19',
4461 'TG': '196.168.0.0/14',
4462 'TH': '171.96.0.0/13',
4463 'TJ': '85.9.128.0/18',
4464 'TK': '27.96.24.0/21',
4465 'TL': '180.189.160.0/20',
4466 'TM': '95.85.96.0/19',
4467 'TN': '197.0.0.0/11',
4468 'TO': '175.176.144.0/21',
4469 'TR': '78.160.0.0/11',
4470 'TT': '186.44.0.0/15',
4471 'TV': '202.2.96.0/19',
4472 'TW': '120.96.0.0/11',
4473 'TZ': '156.156.0.0/14',
4474 'UA': '37.52.0.0/14',
4475 'UG': '102.80.0.0/13',
4476 'US': '6.0.0.0/8',
4477 'UY': '167.56.0.0/13',
4478 'UZ': '84.54.64.0/18',
4479 'VA': '212.77.0.0/19',
4480 'VC': '207.191.240.0/21',
4481 'VE': '186.88.0.0/13',
4482 'VG': '66.81.192.0/20',
4483 'VI': '146.226.0.0/16',
4484 'VN': '14.160.0.0/11',
4485 'VU': '202.80.32.0/20',
4486 'WF': '117.20.32.0/21',
4487 'WS': '202.4.32.0/19',
4488 'YE': '134.35.0.0/16',
4489 'YT': '41.242.116.0/22',
4490 'ZA': '41.0.0.0/11',
4491 'ZM': '102.144.0.0/13',
4492 'ZW': '102.177.192.0/18',
4493 }
4494
4495 @classmethod
4496 def random_ipv4(cls, code_or_block):
4497 if len(code_or_block) == 2:
4498 block = cls._country_ip_map.get(code_or_block.upper())
4499 if not block:
4500 return None
4501 else:
4502 block = code_or_block
4503 addr, preflen = block.split('/')
4504 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4505 addr_max = addr_min | (0xffffffff >> int(preflen))
4506 return compat_str(socket.inet_ntoa(
4507 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
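# Illustrative use (results are random): a 2-letter code selects the country
# block above, anything else is treated as a CIDR block directly:
#   GeoUtils.random_ipv4('DE')            # e.g. '53.12.34.56' (from 53.0.0.0/8)
#   GeoUtils.random_ipv4('192.0.2.0/24')  # e.g. '192.0.2.17'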
4508
4509
4510 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4511 def __init__(self, proxies=None):
4512 # Set default handlers
4513 for type in ('http', 'https'):
4514 setattr(self, '%s_open' % type,
4515 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4516 meth(r, proxy, type))
4517 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4518
4519 def proxy_open(self, req, proxy, type):
4520 req_proxy = req.headers.get('Ytdl-request-proxy')
4521 if req_proxy is not None:
4522 proxy = req_proxy
4523 del req.headers['Ytdl-request-proxy']
4524
4525 if proxy == '__noproxy__':
4526 return None # No Proxy
4527 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4528 req.add_header('Ytdl-socks-proxy', proxy)
4529 # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS themselves
4530 return None
4531 return compat_urllib_request.ProxyHandler.proxy_open(
4532 self, req, proxy, type)
4533
4534
4535 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4536 # released into Public Domain
4537 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4538
4539 def long_to_bytes(n, blocksize=0):
4540 """long_to_bytes(n:long, blocksize:int) : string
4541 Convert a long integer to a byte string.
4542
4543 If optional blocksize is given and greater than zero, pad the front of the
4544 byte string with binary zeros so that the length is a multiple of
4545 blocksize.
4546 """
4547 # after much testing, this algorithm was deemed to be the fastest
4548 s = b''
4549 n = int(n)
4550 while n > 0:
4551 s = compat_struct_pack('>I', n & 0xffffffff) + s
4552 n = n >> 32
4553 # strip off leading zeros
4554 for i in range(len(s)):
4555 if s[i] != b'\000'[0]:
4556 break
4557 else:
4558 # only happens when n == 0
4559 s = b'\000'
4560 i = 0
4561 s = s[i:]
4562 # add back some pad bytes. this could be done more efficiently w.r.t. the
4563 # de-padding being done above, but sigh...
4564 if blocksize > 0 and len(s) % blocksize:
4565 s = (blocksize - len(s) % blocksize) * b'\000' + s
4566 return s
4567
4568
4569 def bytes_to_long(s):
4570 """bytes_to_long(string) : long
4571 Convert a byte string to a long integer.
4572
4573 This is (essentially) the inverse of long_to_bytes().
4574 """
4575 acc = 0
4576 length = len(s)
4577 if length % 4:
4578 extra = (4 - length % 4)
4579 s = b'\000' * extra + s
4580 length = length + extra
4581 for i in range(0, length, 4):
4582 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4583 return acc
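# Illustrative round trip (big-endian, as in PyCrypto):
#   bytes_to_long(b'\x01\x00')       # 256
#   long_to_bytes(256)               # b'\x01\x00'
#   long_to_bytes(256, blocksize=4)  # b'\x00\x00\x01\x00'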
4584
4585
4586 def ohdave_rsa_encrypt(data, exponent, modulus):
4587 '''
4588 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4589
4590 Input:
4591 data: data to encrypt, bytes-like object
4592 exponent, modulus: parameter e and N of RSA algorithm, both integer
4593 Output: hex string of encrypted data
4594
4595 Limitation: supports one block encryption only
4596 '''
4597
4598 payload = int(binascii.hexlify(data[::-1]), 16)
4599 encrypted = pow(payload, exponent, modulus)
4600 return '%x' % encrypted
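# Illustrative call with toy parameters (real sites supply a large e and N);
# note the input bytes are reversed before being interpreted as an integer:
#   ohdave_rsa_encrypt(b'\x02', 1, 255)  # '2', since pow(2, 1, 255) == 2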
4601
4602
4603 def pkcs1pad(data, length):
4604 """
4605 Padding input data with PKCS#1 scheme
4606
4607 @param {int[]} data input data
4608 @param {int} length target length
4609 @returns {int[]} padded data
4610 """
4611 if len(data) > length - 11:
4612 raise ValueError('Input data too long for PKCS#1 padding')
4613
4614 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4615 return [0, 2] + pseudo_random + [0] + data
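# Illustrative padding (the random filler makes the middle non-deterministic):
#   padded = pkcs1pad([1, 2, 3], 16)
#   padded[:2], padded[-4:], len(padded)  # ([0, 2], [0, 1, 2, 3], 16)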
4616
4617
4618 def encode_base_n(num, n, table=None):
4619 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4620 if not table:
4621 table = FULL_TABLE[:n]
4622
4623 if n > len(table):
4624 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4625
4626 if num == 0:
4627 return table[0]
4628
4629 ret = ''
4630 while num:
4631 ret = table[num % n] + ret
4632 num = num // n
4633 return ret
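# Illustrative encodings (default table, assumed values):
#   encode_base_n(255, 16)  # 'ff'
#   encode_base_n(255, 2)   # '11111111'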
4634
4635
4636 def decode_packed_codes(code):
4637 mobj = re.search(PACKED_CODES_RE, code)
4638 obfuscated_code, base, count, symbols = mobj.groups()
4639 base = int(base)
4640 count = int(count)
4641 symbols = symbols.split('|')
4642 symbol_table = {}
4643
4644 while count:
4645 count -= 1
4646 base_n_count = encode_base_n(count, base)
4647 symbol_table[base_n_count] = symbols[count] or base_n_count
4648
4649 return re.sub(
4650 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4651 obfuscated_code)
4652
4653
4654 def caesar(s, alphabet, shift):
4655 if shift == 0:
4656 return s
4657 l = len(alphabet)
4658 return ''.join(
4659 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4660 for c in s)
4661
4662
4663 def rot47(s):
4664 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
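# Illustrative use (assumed inputs); since the rot47 alphabet has 94
# characters and 47 * 2 == 94, rot47 is its own inverse:
#   caesar('ab', 'abc', 1)    # 'bc'
#   rot47(rot47('any text'))  # 'any text'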
4665
4666
4667 def parse_m3u8_attributes(attrib):
4668 info = {}
4669 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4670 if val.startswith('"'):
4671 val = val[1:-1]
4672 info[key] = val
4673 return info
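# Illustrative parse (quoted values keep embedded commas, quotes are stripped):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401e"')
#   # {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401e'}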
4674
4675
4676 def urshift(val, n):
4677 return val >> n if val >= 0 else (val + 0x100000000) >> n
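# Illustrative use: an unsigned 32-bit right shift, like JavaScript's '>>>':
#   urshift(-1, 1)  # 0x7fffffff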
4678
4679
4680 # Based on png2str() written by @gdkchan and improved by @yokrysty
4681 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4682 def decode_png(png_data):
4683 # Reference: https://www.w3.org/TR/PNG/
4684 header = png_data[8:]
4685
4686 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4687 raise IOError('Not a valid PNG file.')
4688
4689 int_map = {1: '>B', 2: '>H', 4: '>I'}
4690 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4691
4692 chunks = []
4693
4694 while header:
4695 length = unpack_integer(header[:4])
4696 header = header[4:]
4697
4698 chunk_type = header[:4]
4699 header = header[4:]
4700
4701 chunk_data = header[:length]
4702 header = header[length:]
4703
4704 header = header[4:] # Skip CRC
4705
4706 chunks.append({
4707 'type': chunk_type,
4708 'length': length,
4709 'data': chunk_data
4710 })
4711
4712 ihdr = chunks[0]['data']
4713
4714 width = unpack_integer(ihdr[:4])
4715 height = unpack_integer(ihdr[4:8])
4716
4717 idat = b''
4718
4719 for chunk in chunks:
4720 if chunk['type'] == b'IDAT':
4721 idat += chunk['data']
4722
4723 if not idat:
4724 raise IOError('Unable to read PNG data.')
4725
4726 decompressed_data = bytearray(zlib.decompress(idat))
4727
4728 stride = width * 3
4729 pixels = []
4730
4731 def _get_pixel(idx):
4732 x = idx % stride
4733 y = idx // stride
4734 return pixels[y][x]
4735
4736 for y in range(height):
4737 basePos = y * (1 + stride)
4738 filter_type = decompressed_data[basePos]
4739
4740 current_row = []
4741
4742 pixels.append(current_row)
4743
4744 for x in range(stride):
4745 color = decompressed_data[1 + basePos + x]
4746 basex = y * stride + x
4747 left = 0
4748 up = 0
4749
4750 if x > 2:
4751 left = _get_pixel(basex - 3)
4752 if y > 0:
4753 up = _get_pixel(basex - stride)
4754
4755 if filter_type == 1: # Sub
4756 color = (color + left) & 0xff
4757 elif filter_type == 2: # Up
4758 color = (color + up) & 0xff
4759 elif filter_type == 3: # Average
4760 color = (color + ((left + up) >> 1)) & 0xff
4761 elif filter_type == 4: # Paeth
4762 a = left
4763 b = up
4764 c = 0
4765
4766 if x > 2 and y > 0:
4767 c = _get_pixel(basex - stride - 3)
4768
4769 p = a + b - c
4770
4771 pa = abs(p - a)
4772 pb = abs(p - b)
4773 pc = abs(p - c)
4774
4775 if pa <= pb and pa <= pc:
4776 color = (color + a) & 0xff
4777 elif pb <= pc:
4778 color = (color + b) & 0xff
4779 else:
4780 color = (color + c) & 0xff
4781
4782 current_row.append(color)
4783
4784 return width, height, pixels
4785
4786
4787 def write_xattr(path, key, value):
4788 # This mess below finds the best xattr tool for the job
4789 try:
4790 # try the pyxattr module...
4791 import xattr
4792
4793 if hasattr(xattr, 'set'): # pyxattr
4794 # Unicode arguments are not supported in python-pyxattr until
4795 # version 0.5.0
4796 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4797 pyxattr_required_version = '0.5.0'
4798 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4799 # TODO: fallback to CLI tools
4800 raise XAttrUnavailableError(
4801 'python-pyxattr is detected but is too old. '
4802 'yt-dlp requires %s or above while your version is %s. '
4803 'Falling back to other xattr implementations' % (
4804 pyxattr_required_version, xattr.__version__))
4805
4806 setxattr = xattr.set
4807 else: # xattr
4808 setxattr = xattr.setxattr
4809
4810 try:
4811 setxattr(path, key, value)
4812 except EnvironmentError as e:
4813 raise XAttrMetadataError(e.errno, e.strerror)
4814
4815 except ImportError:
4816 if compat_os_name == 'nt':
4817 # Write xattrs to NTFS Alternate Data Streams:
4818 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4819 assert ':' not in key
4820 assert os.path.exists(path)
4821
4822 ads_fn = path + ':' + key
4823 try:
4824 with open(ads_fn, 'wb') as f:
4825 f.write(value)
4826 except EnvironmentError as e:
4827 raise XAttrMetadataError(e.errno, e.strerror)
4828 else:
4829 user_has_setfattr = check_executable('setfattr', ['--version'])
4830 user_has_xattr = check_executable('xattr', ['-h'])
4831
4832 if user_has_setfattr or user_has_xattr:
4833
4834 value = value.decode('utf-8')
4835 if user_has_setfattr:
4836 executable = 'setfattr'
4837 opts = ['-n', key, '-v', value]
4838 elif user_has_xattr:
4839 executable = 'xattr'
4840 opts = ['-w', key, value]
4841
4842 cmd = ([encodeFilename(executable, True)]
4843 + [encodeArgument(o) for o in opts]
4844 + [encodeFilename(path, True)])
4845
4846 try:
4847 p = Popen(
4848 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4849 except EnvironmentError as e:
4850 raise XAttrMetadataError(e.errno, e.strerror)
4851 stdout, stderr = p.communicate_or_kill()
4852 stderr = stderr.decode('utf-8', 'replace')
4853 if p.returncode != 0:
4854 raise XAttrMetadataError(p.returncode, stderr)
4855
4856 else:
4857 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
4858 if sys.platform.startswith('linux'):
4859 raise XAttrUnavailableError(
4860 "Couldn't find a tool to set the xattrs. "
4861 "Install either the python 'pyxattr' or 'xattr' "
4862 "modules, or the GNU 'attr' package "
4863 "(which contains the 'setfattr' tool).")
4864 else:
4865 raise XAttrUnavailableError(
4866 "Couldn't find a tool to set the xattrs. "
4867 "Install either the python 'xattr' module, "
4868 "or the 'xattr' binary.")
4869
4870
4871 def random_birthday(year_field, month_field, day_field):
4872 start_date = datetime.date(1950, 1, 1)
4873 end_date = datetime.date(1995, 12, 31)
4874 offset = random.randint(0, (end_date - start_date).days)
4875 random_date = start_date + datetime.timedelta(offset)
4876 return {
4877 year_field: str(random_date.year),
4878 month_field: str(random_date.month),
4879 day_field: str(random_date.day),
4880 }
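# Illustrative result (random date; all values are strings):
#   random_birthday('birth_year', 'birth_month', 'birth_day')
#   # e.g. {'birth_year': '1984', 'birth_month': '7', 'birth_day': '19'}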
4881
4882
4883 # Templates for internet shortcut files, which are plain text files.
4884 DOT_URL_LINK_TEMPLATE = '''
4885 [InternetShortcut]
4886 URL=%(url)s
4887 '''.lstrip()
4888
4889 DOT_WEBLOC_LINK_TEMPLATE = '''
4890 <?xml version="1.0" encoding="UTF-8"?>
4891 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4892 <plist version="1.0">
4893 <dict>
4894 \t<key>URL</key>
4895 \t<string>%(url)s</string>
4896 </dict>
4897 </plist>
4898 '''.lstrip()
4899
4900 DOT_DESKTOP_LINK_TEMPLATE = '''
4901 [Desktop Entry]
4902 Encoding=UTF-8
4903 Name=%(filename)s
4904 Type=Link
4905 URL=%(url)s
4906 Icon=text-html
4907 '''.lstrip()
4908
4909 LINK_TEMPLATES = {
4910 'url': DOT_URL_LINK_TEMPLATE,
4911 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4912 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4913 }
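# Illustrative expansion (assumed URL):
#   DOT_URL_LINK_TEMPLATE % {'url': 'https://example.com'}
#   # '[InternetShortcut]\nURL=https://example.com\n'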
4914
4915
4916 def iri_to_uri(iri):
4917 """
4918 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4919
4920 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding, except those already escaped, leaving the URI intact.
4921 """
4922
4923 iri_parts = compat_urllib_parse_urlparse(iri)
4924
4925 if '[' in iri_parts.netloc:
4926 raise ValueError('IPv6 URIs are not yet supported.')
4927 # Querying `.netloc` also raises a ValueError when there is only one bracket.
4928
4929 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4930
4931 net_location = ''
4932 if iri_parts.username:
4933 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4934 if iri_parts.password is not None:
4935 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4936 net_location += '@'
4937
4938 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4939 # The 'idna' encoding produces ASCII text.
4940 if iri_parts.port is not None and iri_parts.port != 80:
4941 net_location += ':' + str(iri_parts.port)
4942
4943 return compat_urllib_parse_urlunparse(
4944 (iri_parts.scheme,
4945 net_location,
4946
4947 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4948
4949 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4950 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4951
4952 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4953 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4954
4955 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4956
4957 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
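# Illustrative conversion (assumed IRI): non-ASCII path/query characters are
# percent-escaped as UTF-8 while the already-ASCII parts are left alone:
#   iri_to_uri('http://example.com/π?q=ä')  # 'http://example.com/%CF%80?q=%C3%A4'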
4958
4959
4960 def to_high_limit_path(path):
4961 if sys.platform in ['win32', 'cygwin']:
4962 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4963 return r'\\?\ '.rstrip() + os.path.abspath(path)
4964
4965 return path
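# Illustrative behaviour (only effective on win32/cygwin; assumed path):
#   to_high_limit_path(r'C:\videos\clip.mp4')  # r'\\?\C:\videos\clip.mp4'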
4966
4967
4968 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4969 if field is None:
4970 val = obj if obj is not None else default
4971 else:
4972 val = obj.get(field, default)
4973 if func and val not in ignore:
4974 val = func(val)
4975 return template % val if val not in ignore else default
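# Illustrative formatting (assumed info dicts):
#   format_field({'height': 1080}, 'height', '%sp')  # '1080p'
#   format_field({'height': None}, 'height', '%sp')  # '' (None is ignored)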
4976
4977
4978 def clean_podcast_url(url):
4979 return re.sub(r'''(?x)
4980 (?:
4981 (?:
4982 chtbl\.com/track|
4983 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4984 play\.podtrac\.com
4985 )/[^/]+|
4986 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4987 flex\.acast\.com|
4988 pd(?:
4989 cn\.co| # https://podcorn.com/analytics-prefix/
4990 st\.fm # https://podsights.com/docs/
4991 )/e
4992 )/''', '', url)
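# Illustrative cleanup (assumed URL): tracking-prefix hops are stripped,
# leaving the direct media URL:
#   clean_podcast_url('https://chtbl.com/track/XXXX/traffic.example.com/e.mp3')
#   # 'https://traffic.example.com/e.mp3'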
4993
4994
4995 _HEX_TABLE = '0123456789abcdef'
4996
4997
4998 def random_uuidv4():
4999 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
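# Illustrative output (random): only the version nibble is fixed to 4; the
# 'y' position is fully random, unlike a strict RFC 4122 variant nibble:
#   random_uuidv4()  # e.g. 'd9e330ba-3f62-4a37-9b55-2f6fb1e0b2cf'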
5000
5001
5002 def make_dir(path, to_screen=None):
5003 try:
5004 dn = os.path.dirname(path)
5005 if dn and not os.path.exists(dn):
5006 os.makedirs(dn)
5007 return True
5008 except (OSError, IOError) as err:
5009 if callable(to_screen):
5010 to_screen('unable to create directory ' + error_to_compat_str(err))
5011 return False
5012
5013
5014 def get_executable_path():
5015 from zipimport import zipimporter
5016 if hasattr(sys, 'frozen'): # Running from PyInstaller
5017 path = os.path.dirname(sys.executable)
5018 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5019 path = os.path.join(os.path.dirname(__file__), '../..')
5020 else:
5021 path = os.path.join(os.path.dirname(__file__), '..')
5022 return os.path.abspath(path)
5023
5024
5025 def load_plugins(name, suffix, namespace):
5026 classes = {}
5027 try:
5028 plugins_spec = importlib.util.spec_from_file_location(
5029 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5030 plugins = importlib.util.module_from_spec(plugins_spec)
5031 sys.modules[plugins_spec.name] = plugins
5032 plugins_spec.loader.exec_module(plugins)
5033 for name in dir(plugins):
5034 if name in namespace:
5035 continue
5036 if not name.endswith(suffix):
5037 continue
5038 klass = getattr(plugins, name)
5039 classes[name] = namespace[name] = klass
5040 except FileNotFoundError:
5041 pass
5042 return classes
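# Illustrative call (this mirrors how plugin extractors/postprocessors are
# picked up from a 'ytdlp_plugins' package next to the executable):
#   load_plugins('extractor', 'IE', globals())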
5043
5044
5045 def traverse_obj(
5046 obj, *path_list, default=None, expected_type=None, get_all=True,
5047 casesense=True, is_user_input=False, traverse_string=False):
5048 ''' Traverse nested list/dict/tuple
5049 @param path_list A list of paths which are checked one by one.
5050 Each path is a list of keys where each key is a string,
5051 a function, a tuple of strings/None or "...".
5052 When a function is given, it takes the key as argument and
5053 returns whether the key matches or not. When a tuple is given,
5054 all the keys given in the tuple are traversed, and
5055 "..." traverses all the keys in the object
5056 "None" returns the object without traversal
5057 @param default Default value to return
5058 @param expected_type Only accept final value of this type (Can also be any callable)
5059 @param get_all Return all the values obtained from a path or only the first one
5060 @param casesense Whether to consider dictionary keys as case sensitive
5061 @param is_user_input Whether the keys are generated from user input. If True,
5062 strings are converted to int/slice if necessary
5063 @param traverse_string Whether to traverse inside strings. If True, any
5064 non-compatible object will also be converted into a string
5065 # TODO: Write tests
5066 '''
5067 if not casesense:
5068 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5069 path_list = (map(_lower, variadic(path)) for path in path_list)
5070
5071 def _traverse_obj(obj, path, _current_depth=0):
5072 nonlocal depth
5073 path = tuple(variadic(path))
5074 for i, key in enumerate(path):
5075 if None in (key, obj):
5076 return obj
5077 if isinstance(key, (list, tuple)):
5078 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5079 key = ...
5080 if key is ...:
5081 obj = (obj.values() if isinstance(obj, dict)
5082 else obj if isinstance(obj, (list, tuple, LazyList))
5083 else str(obj) if traverse_string else [])
5084 _current_depth += 1
5085 depth = max(depth, _current_depth)
5086 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5087 elif callable(key):
5088 if isinstance(obj, (list, tuple, LazyList)):
5089 obj = enumerate(obj)
5090 elif isinstance(obj, dict):
5091 obj = obj.items()
5092 else:
5093 if not traverse_string:
5094 return None
5095 obj = str(obj)
5096 _current_depth += 1
5097 depth = max(depth, _current_depth)
5098 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5099 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5100 obj = (obj.get(key) if casesense or (key in obj)
5101 else next((v for k, v in obj.items() if _lower(k) == key), None))
5102 else:
5103 if is_user_input:
5104 key = (int_or_none(key) if ':' not in key
5105 else slice(*map(int_or_none, key.split(':'))))
5106 if key == slice(None):
5107 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5108 if not isinstance(key, (int, slice)):
5109 return None
5110 if not isinstance(obj, (list, tuple, LazyList)):
5111 if not traverse_string:
5112 return None
5113 obj = str(obj)
5114 try:
5115 obj = obj[key]
5116 except IndexError:
5117 return None
5118 return obj
5119
5120 if isinstance(expected_type, type):
5121 type_test = lambda val: val if isinstance(val, expected_type) else None
5122 elif expected_type is not None:
5123 type_test = expected_type
5124 else:
5125 type_test = lambda val: val
5126
5127 for path in path_list:
5128 depth = 0
5129 val = _traverse_obj(obj, path)
5130 if val is not None:
5131 if depth:
5132 for _ in range(depth - 1):
5133 val = itertools.chain.from_iterable(v for v in val if v is not None)
5134 val = [v for v in map(type_test, val) if v is not None]
5135 if val:
5136 return val if get_all else val[0]
5137 else:
5138 val = type_test(val)
5139 if val is not None:
5140 return val
5141 return default
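# Illustrative traversals (assumed objects):
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))     # [1, 2]
#   traverse_obj({'a': [{'b': 1}]}, ('a', 0, 'c'), ('a', 0, 'b'))  # 1 (first matching path)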
5142
5143
5144 def traverse_dict(dictn, keys, casesense=True):
5145 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5146 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5147 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5148
5149
5150 def variadic(x, allowed_types=(str, bytes, dict)):
5151 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
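# Illustrative use: wraps scalars in a tuple, passes other iterables through:
#   variadic('spam')  # ('spam',)
#   variadic([1, 2])  # [1, 2]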
5152
5153
5154 # create a JSON Web Signature (jws) with HS256 algorithm
5155 # the resulting format is in JWS Compact Serialization
5156 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5157 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5158 def jwt_encode_hs256(payload_data, key, headers={}):
5159 header_data = {
5160 'alg': 'HS256',
5161 'typ': 'JWT',
5162 }
5163 if headers:
5164 header_data.update(headers)
5165 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5166 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5167 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5168 signature_b64 = base64.b64encode(h.digest())
5169 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5170 return token
5171
5172
5173 # can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
5174 def jwt_decode_hs256(jwt):
5175 header_b64, payload_b64, signature_b64 = jwt.split('.')
5176 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5177 return payload_data
5178
5179
5180 def supports_terminal_sequences(stream):
5181 if compat_os_name == 'nt':
5182 from .compat import WINDOWS_VT_MODE # Must be imported locally
5183 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5184 return False
5185 elif not os.getenv('TERM'):
5186 return False
5187 try:
5188 return stream.isatty()
5189 except BaseException:
5190 return False
5191
5192
5193 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5194
5195
5196 def remove_terminal_sequences(string):
5197 return _terminal_sequences_re.sub('', string)
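# Illustrative use (assumed input):
#   remove_terminal_sequences('\033[32mOK\033[0m')  # 'OK'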
5198
5199
5200 def number_of_digits(number):
5201 return len('%d' % number)
5202
5203
5204 def join_nonempty(*values, delim='-', from_dict=None):
5205 if from_dict is not None:
5206 values = map(from_dict.get, values)
5207 return delim.join(map(str, filter(None, values)))
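# Illustrative joins (falsy values, including 0 and '', are dropped):
#   join_nonempty('720p', None, 'dash')     # '720p-dash'
#   join_nonempty('a', '', 'b', delim='.')  # 'a.b'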
5208
5209
5210 class Config:
5211 own_args = None
5212 filename = None
5213 __initialized = False
5214
5215 def __init__(self, parser, label=None):
5216 self._parser, self.label = parser, label
5217 self._loaded_paths, self.configs = set(), []
5218
5219 def init(self, args=None, filename=None):
5220 assert not self.__initialized
5221 if filename:
5222 location = os.path.realpath(filename)
5223 if location in self._loaded_paths:
5224 return False
5225 self._loaded_paths.add(location)
5226
5227 self.__initialized = True
5228 self.own_args, self.filename = args, filename
5229 for location in self._parser.parse_args(args)[0].config_locations or []:
5230 location = compat_expanduser(location)
5231 if os.path.isdir(location):
5232 location = os.path.join(location, 'yt-dlp.conf')
5233 if not os.path.exists(location):
5234 self._parser.error(f'config location {location} does not exist')
5235 self.append_config(self.read_file(location), location)
5236 return True
5237
5238 def __str__(self):
5239 label = join_nonempty(
5240 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5241 delim=' ')
5242 return join_nonempty(
5243 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5244 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5245 delim='\n')
5246
5247 @staticmethod
5248 def read_file(filename, default=[]):
5249 try:
5250 optionf = open(filename)
5251 except IOError:
5252 return default # silently skip if file is not present
5253 try:
5254 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5255 contents = optionf.read()
5256 if sys.version_info < (3,):
5257 contents = contents.decode(preferredencoding())
5258 res = compat_shlex_split(contents, comments=True)
5259 finally:
5260 optionf.close()
5261 return res
5262
5263 @staticmethod
5264 def hide_login_info(opts):
5265 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5266 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5267
5268 def _scrub_eq(o):
5269 m = eqre.match(o)
5270 if m:
5271 return m.group('key') + '=PRIVATE'
5272 else:
5273 return o
5274
5275 opts = list(map(_scrub_eq, opts))
5276 for idx, opt in enumerate(opts):
5277 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5278 opts[idx + 1] = 'PRIVATE'
5279 return opts
5280
5281 def append_config(self, *args, label=None):
5282 config = type(self)(self._parser, label)
5283 config._loaded_paths = self._loaded_paths
5284 if config.init(*args):
5285 self.configs.append(config)
5286
5287 @property
5288 def all_args(self):
5289 for config in reversed(self.configs):
5290 yield from config.all_args
5291 yield from self.own_args or []
5292
5293 def parse_args(self):
5294 return self._parser.parse_args(list(self.all_args))