yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import asyncio
   7 import atexit
   8 import base64
   9 import binascii
  10 import calendar
  11 import codecs
  12 import collections
  13 import contextlib
  14 import ctypes
  15 import datetime
  16 import email.utils
  17 import email.header
  18 import errno
  19 import functools
  20 import gzip
  21 import hashlib
  22 import hmac
  23 import importlib.util
  24 import io
  25 import itertools
  26 import json
  27 import locale
  28 import math
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import socket
  35 import ssl
  36 import subprocess
  37 import sys
  38 import tempfile
  39 import time
  40 import traceback
  41 import xml.etree.ElementTree
  42 import zlib
  43 import mimetypes
  44
  45 from .compat import (
  46     compat_HTMLParseError,
  47     compat_HTMLParser,
  48     compat_HTTPError,
  49     compat_basestring,
  50     compat_brotli,
  51     compat_chr,
  52     compat_cookiejar,
  53     compat_ctypes_WINFUNCTYPE,
  54     compat_etree_fromstring,
  55     compat_expanduser,
  56     compat_html_entities,
  57     compat_html_entities_html5,
  58     compat_http_client,
  59     compat_integer_types,
  60     compat_numeric_types,
  61     compat_kwargs,
  62     compat_os_name,
  63     compat_parse_qs,
  64     compat_shlex_split,
  65     compat_shlex_quote,
  66     compat_str,
  67     compat_struct_pack,
  68     compat_struct_unpack,
  69     compat_urllib_error,
  70     compat_urllib_parse,
  71     compat_urllib_parse_urlencode,
  72     compat_urllib_parse_urlparse,
  73     compat_urllib_parse_urlunparse,
  74     compat_urllib_parse_quote,
  75     compat_urllib_parse_quote_plus,
  76     compat_urllib_parse_unquote_plus,
  77     compat_urllib_request,
  78     compat_urlparse,
  79     compat_websockets,
  80     compat_xpath,
  81 )
  82
  83 from .socks import (
  84     ProxyType,
  85     sockssocket,
  86 )
  87
  88
  89 def register_socks_protocols():
  90     # "Register" SOCKS protocols
  91     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  92     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  93     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  94         if scheme not in compat_urlparse.uses_netloc:
  95             compat_urlparse.uses_netloc.append(scheme)
  96
  97
  98 # This is not clearly defined otherwise
  99 compiled_regex_type = type(re.compile(''))
 100
 101
 102 def random_user_agent():
 103     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
 104     _CHROME_VERSIONS = (
 105         '90.0.4430.212',
 106         '90.0.4430.24',
 107         '90.0.4430.70',
 108         '90.0.4430.72',
 109         '90.0.4430.85',
 110         '90.0.4430.93',
 111         '91.0.4472.101',
 112         '91.0.4472.106',
 113         '91.0.4472.114',
 114         '91.0.4472.124',
 115         '91.0.4472.164',
 116         '91.0.4472.19',
 117         '91.0.4472.77',
 118         '92.0.4515.107',
 119         '92.0.4515.115',
 120         '92.0.4515.131',
 121         '92.0.4515.159',
 122         '92.0.4515.43',
 123         '93.0.4556.0',
 124         '93.0.4577.15',
 125         '93.0.4577.63',
 126         '93.0.4577.82',
 127         '94.0.4606.41',
 128         '94.0.4606.54',
 129         '94.0.4606.61',
 130         '94.0.4606.71',
 131         '94.0.4606.81',
 132         '94.0.4606.85',
 133         '95.0.4638.17',
 134         '95.0.4638.50',
 135         '95.0.4638.54',
 136         '95.0.4638.69',
 137         '95.0.4638.74',
 138         '96.0.4664.18',
 139         '96.0.4664.45',
 140         '96.0.4664.55',
 141         '96.0.4664.93',
 142         '97.0.4692.20',
 143     )
 144     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 145
 146
 147 SUPPORTED_ENCODINGS = [
 148     'gzip', 'deflate'
 149 ]
 150 if compat_brotli:
 151     SUPPORTED_ENCODINGS.append('br')
 152
 153 std_headers = {
 154     'User-Agent': random_user_agent(),
 155     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 156     'Accept-Encoding': ', '.join(SUPPORTED_ENCODINGS),
 157     'Accept-Language': 'en-us,en;q=0.5',
 158     'Sec-Fetch-Mode': 'navigate',
 159 }
 160
 161
 162 USER_AGENTS = {
 163     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 164 }
 165
 166
 167 NO_DEFAULT = object()
 168
 169 ENGLISH_MONTH_NAMES = [
 170     'January', 'February', 'March', 'April', 'May', 'June',
 171     'July', 'August', 'September', 'October', 'November', 'December']
 172
 173 MONTH_NAMES = {
 174     'en': ENGLISH_MONTH_NAMES,
 175     'fr': [
 176         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 177         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 178 }
 179
 180 KNOWN_EXTENSIONS = (
 181     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 182     'flv', 'f4v', 'f4a', 'f4b',
 183     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 184     'mkv', 'mka', 'mk3d',
 185     'avi', 'divx',
 186     'mov',
 187     'asf', 'wmv', 'wma',
 188     '3gp', '3g2',
 189     'mp3',
 190     'flac',
 191     'ape',
 192     'wav',
 193     'f4f', 'f4m', 'm3u8', 'smil')
 194
 195 # needed for sanitizing filenames in restricted mode
 196 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 197                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 198                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 199
 200 DATE_FORMATS = (
 201     '%d %B %Y',
 202     '%d %b %Y',
 203     '%B %d %Y',
 204     '%B %dst %Y',
 205     '%B %dnd %Y',
 206     '%B %drd %Y',
 207     '%B %dth %Y',
 208     '%b %d %Y',
 209     '%b %dst %Y',
 210     '%b %dnd %Y',
 211     '%b %drd %Y',
 212     '%b %dth %Y',
 213     '%b %dst %Y %I:%M',
 214     '%b %dnd %Y %I:%M',
 215     '%b %drd %Y %I:%M',
 216     '%b %dth %Y %I:%M',
 217     '%Y %m %d',
 218     '%Y-%m-%d',
 219     '%Y.%m.%d.',
 220     '%Y/%m/%d',
 221     '%Y/%m/%d %H:%M',
 222     '%Y/%m/%d %H:%M:%S',
 223     '%Y%m%d%H%M',
 224     '%Y%m%d%H%M%S',
 225     '%Y%m%d',
 226     '%Y-%m-%d %H:%M',
 227     '%Y-%m-%d %H:%M:%S',
 228     '%Y-%m-%d %H:%M:%S.%f',
 229     '%Y-%m-%d %H:%M:%S:%f',
 230     '%d.%m.%Y %H:%M',
 231     '%d.%m.%Y %H.%M',
 232     '%Y-%m-%dT%H:%M:%SZ',
 233     '%Y-%m-%dT%H:%M:%S.%fZ',
 234     '%Y-%m-%dT%H:%M:%S.%f0Z',
 235     '%Y-%m-%dT%H:%M:%S',
 236     '%Y-%m-%dT%H:%M:%S.%f',
 237     '%Y-%m-%dT%H:%M',
 238     '%b %d %Y at %H:%M',
 239     '%b %d %Y at %H:%M:%S',
 240     '%B %d %Y at %H:%M',
 241     '%B %d %Y at %H:%M:%S',
 242     '%H:%M %d-%b-%Y',
 243 )
 244
 245 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 246 DATE_FORMATS_DAY_FIRST.extend([
 247     '%d-%m-%Y',
 248     '%d.%m.%Y',
 249     '%d.%m.%y',
 250     '%d/%m/%Y',
 251     '%d/%m/%y',
 252     '%d/%m/%Y %H:%M:%S',
 253 ])
 254
 255 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 256 DATE_FORMATS_MONTH_FIRST.extend([
 257     '%m-%d-%Y',
 258     '%m.%d.%Y',
 259     '%m/%d/%Y',
 260     '%m/%d/%y',
 261     '%m/%d/%Y %H:%M:%S',
 262 ])
 263
 264 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 265 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 266
 267
 268 def preferredencoding():
 269     """Get preferred encoding.
 270
 271     Returns the best encoding scheme for the system, based on
 272     locale.getpreferredencoding() and some further tweaks.
 273     """
 274     try:
 275         pref = locale.getpreferredencoding()
 276         'TEST'.encode(pref)
 277     except Exception:
 278         pref = 'UTF-8'
 279
 280     return pref
 281
 282
 283 def write_json_file(obj, fn):
 284     """ Encode obj as JSON and write it to fn, atomically if possible """
 285
 286     fn = encodeFilename(fn)
 287     if sys.version_info < (3, 0) and sys.platform != 'win32':
 288         encoding = get_filesystem_encoding()
 289         # os.path.basename returns a bytes object, but NamedTemporaryFile
 290         # will fail if the filename contains non ascii characters unless we
 291         # use a unicode object
 292         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 293         # the same for os.path.dirname
 294         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 295     else:
 296         path_basename = os.path.basename
 297         path_dirname = os.path.dirname
 298
 299     args = {
 300         'suffix': '.tmp',
 301         'prefix': path_basename(fn) + '.',
 302         'dir': path_dirname(fn),
 303         'delete': False,
 304     }
 305
 306     # In Python 2.x, json.dump expects a bytestream.
 307     # In Python 3.x, it writes to a character stream
 308     if sys.version_info < (3, 0):
 309         args['mode'] = 'wb'
 310     else:
 311         args.update({
 312             'mode': 'w',
 313             'encoding': 'utf-8',
 314         })
 315
 316     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 317
 318     try:
 319         with tf:
 320             json.dump(obj, tf, ensure_ascii=False)
 321         if sys.platform == 'win32':
 322             # Need to remove existing file on Windows, else os.rename raises
 323             # WindowsError or FileExistsError.
 324             try:
 325                 os.unlink(fn)
 326             except OSError:
 327                 pass
 328         try:
 329             mask = os.umask(0)
 330             os.umask(mask)
 331             os.chmod(tf.name, 0o666 & ~mask)
 332         except OSError:
 333             pass
 334         os.rename(tf.name, fn)
 335     except Exception:
 336         try:
 337             os.remove(tf.name)
 338         except OSError:
 339             pass
 340         raise
 341
 342
 343 if sys.version_info >= (2, 7):
 344     def find_xpath_attr(node, xpath, key, val=None):
 345         """ Find the xpath xpath[@key=val] """
 346         assert re.match(r'^[a-zA-Z_-]+$', key)
 347         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 348         return node.find(expr)
 349 else:
 350     def find_xpath_attr(node, xpath, key, val=None):
 351         for f in node.findall(compat_xpath(xpath)):
 352             if key not in f.attrib:
 353                 continue
 354             if val is None or f.attrib.get(key) == val:
 355                 return f
 356         return None
 357
 358 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 359 # the namespace parameter
 360
 361
 362 def xpath_with_ns(path, ns_map):
 363     components = [c.split(':') for c in path.split('/')]
 364     replaced = []
 365     for c in components:
 366         if len(c) == 1:
 367             replaced.append(c[0])
 368         else:
 369             ns, tag = c
 370             replaced.append('{%s}%s' % (ns_map[ns], tag))
 371     return '/'.join(replaced)
 372
 373
 374 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 375     def _find_xpath(xpath):
 376         return node.find(compat_xpath(xpath))
 377
 378     if isinstance(xpath, (str, compat_str)):
 379         n = _find_xpath(xpath)
 380     else:
 381         for xp in xpath:
 382             n = _find_xpath(xp)
 383             if n is not None:
 384                 break
 385
 386     if n is None:
 387         if default is not NO_DEFAULT:
 388             return default
 389         elif fatal:
 390             name = xpath if name is None else name
 391             raise ExtractorError('Could not find XML element %s' % name)
 392         else:
 393             return None
 394     return n
 395
 396
 397 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 398     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 399     if n is None or n == default:
 400         return n
 401     if n.text is None:
 402         if default is not NO_DEFAULT:
 403             return default
 404         elif fatal:
 405             name = xpath if name is None else name
 406             raise ExtractorError('Could not find XML element\'s text %s' % name)
 407         else:
 408             return None
 409     return n.text
 410
 411
 412 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 413     n = find_xpath_attr(node, xpath, key)
 414     if n is None:
 415         if default is not NO_DEFAULT:
 416             return default
 417         elif fatal:
 418             name = '%s[@%s]' % (xpath, key) if name is None else name
 419             raise ExtractorError('Could not find XML attribute %s' % name)
 420         else:
 421             return None
 422     return n.attrib[key]
 423
 424
 425 def get_element_by_id(id, html):
 426     """Return the content of the tag with the specified ID in the passed HTML document"""
 427     return get_element_by_attribute('id', id, html)
 428
 429
 430 def get_element_html_by_id(id, html):
 431     """Return the html of the tag with the specified ID in the passed HTML document"""
 432     return get_element_html_by_attribute('id', id, html)
 433
 434
 435 def get_element_by_class(class_name, html):
 436     """Return the content of the first tag with the specified class in the passed HTML document"""
 437     retval = get_elements_by_class(class_name, html)
 438     return retval[0] if retval else None
 439
 440
 441 def get_element_html_by_class(class_name, html):
 442     """Return the html of the first tag with the specified class in the passed HTML document"""
 443     retval = get_elements_html_by_class(class_name, html)
 444     return retval[0] if retval else None
 445
 446
 447 def get_element_by_attribute(attribute, value, html, escape_value=True):
 448     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 449     return retval[0] if retval else None
 450
 451
 452 def get_element_html_by_attribute(attribute, value, html, escape_value=True):
 453     retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
 454     return retval[0] if retval else None
 455
 456
 457 def get_elements_by_class(class_name, html):
 458     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 459     return get_elements_by_attribute(
 460         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 461         html, escape_value=False)
 462
 463
 464 def get_elements_html_by_class(class_name, html):
 465     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 466     return get_elements_html_by_attribute(
 467         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 468         html, escape_value=False)
 469
 470
 471 def get_elements_by_attribute(*args, **kwargs):
 472     """Return the content of the tag with the specified attribute in the passed HTML document"""
 473     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 474
 475
 476 def get_elements_html_by_attribute(*args, **kwargs):
 477     """Return the html of the tag with the specified attribute in the passed HTML document"""
 478     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 479
 480
 481 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
 482     """
 483     Return the text (content) and the html (whole) of the tag with the specified
 484     attribute in the passed HTML document
 485     """
 486
 487     value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 488
 489     value = re.escape(value) if escape_value else value
 490
 491     partial_element_re = r'''(?x)
 492         <(?P<tag>[a-zA-Z0-9:._-]+)
 493          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 494          \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
 495         ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
 496
 497     for m in re.finditer(partial_element_re, html):
 498         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 499
 500         yield (
 501             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 502             whole
 503         )
 504
 505
 506 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 507     """
 508     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 509     closing tag for the first opening tag it has encountered, and can be used
 510     as a context manager
 511     """
 512
 513     class HTMLBreakOnClosingTagException(Exception):
 514         pass
 515
 516     def __init__(self):
 517         self.tagstack = collections.deque()
 518         compat_HTMLParser.__init__(self)
 519
 520     def __enter__(self):
 521         return self
 522
 523     def __exit__(self, *_):
 524         self.close()
 525
 526     def close(self):
 527         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 528         # so data remains buffered; we no longer have any interest in it, thus
 529         # override this method to discard it
 530         pass
 531
 532     def handle_starttag(self, tag, _):
 533         self.tagstack.append(tag)
 534
 535     def handle_endtag(self, tag):
 536         if not self.tagstack:
 537             raise compat_HTMLParseError('no tags in the stack')
 538         while self.tagstack:
 539             inner_tag = self.tagstack.pop()
 540             if inner_tag == tag:
 541                 break
 542         else:
 543             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 544         if not self.tagstack:
 545             raise self.HTMLBreakOnClosingTagException()
 546
 547
 548 def get_element_text_and_html_by_tag(tag, html):
 549     """
 550     For the first element with the specified tag in the passed HTML document
 551     return its' content (text) and the whole element (html)
 552     """
 553     def find_or_raise(haystack, needle, exc):
 554         try:
 555             return haystack.index(needle)
 556         except ValueError:
 557             raise exc
 558     closing_tag = f'</{tag}>'
 559     whole_start = find_or_raise(
 560         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 561     content_start = find_or_raise(
 562         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 563     content_start += whole_start + 1
 564     with HTMLBreakOnClosingTagParser() as parser:
 565         parser.feed(html[whole_start:content_start])
 566         if not parser.tagstack or parser.tagstack[0] != tag:
 567             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 568         offset = content_start
 569         while offset < len(html):
 570             next_closing_tag_start = find_or_raise(
 571                 html[offset:], closing_tag,
 572                 compat_HTMLParseError(f'closing {tag} tag not found'))
 573             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 574             try:
 575                 parser.feed(html[offset:offset + next_closing_tag_end])
 576                 offset += next_closing_tag_end
 577             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 578                 return html[content_start:offset + next_closing_tag_start], \
 579                     html[whole_start:offset + next_closing_tag_end]
 580         raise compat_HTMLParseError('unexpected end of html')
 581
 582
 583 class HTMLAttributeParser(compat_HTMLParser):
 584     """Trivial HTML parser to gather the attributes for a single element"""
 585
 586     def __init__(self):
 587         self.attrs = {}
 588         compat_HTMLParser.__init__(self)
 589
 590     def handle_starttag(self, tag, attrs):
 591         self.attrs = dict(attrs)
 592
 593
 594 class HTMLListAttrsParser(compat_HTMLParser):
 595     """HTML parser to gather the attributes for the elements of a list"""
 596
 597     def __init__(self):
 598         compat_HTMLParser.__init__(self)
 599         self.items = []
 600         self._level = 0
 601
 602     def handle_starttag(self, tag, attrs):
 603         if tag == 'li' and self._level == 0:
 604             self.items.append(dict(attrs))
 605         self._level += 1
 606
 607     def handle_endtag(self, tag):
 608         self._level -= 1
 609
 610
 611 def extract_attributes(html_element):
 612     """Given a string for an HTML element such as
 613     <el
 614          a="foo" B="bar" c="&98;az" d=boz
 615          empty= noval entity="&amp;"
 616          sq='"' dq="'"
 617     >
 618     Decode and return a dictionary of attributes.
 619     {
 620         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 621         'empty': '', 'noval': None, 'entity': '&',
 622         'sq': '"', 'dq': '\''
 623     }.
 624     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 625     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 626     """
 627     parser = HTMLAttributeParser()
 628     try:
 629         parser.feed(html_element)
 630         parser.close()
 631     # Older Python may throw HTMLParseError in case of malformed HTML
 632     except compat_HTMLParseError:
 633         pass
 634     return parser.attrs
 635
 636
 637 def parse_list(webpage):
 638     """Given a string for an series of HTML <li> elements,
 639     return a dictionary of their attributes"""
 640     parser = HTMLListAttrsParser()
 641     parser.feed(webpage)
 642     parser.close()
 643     return parser.items
 644
 645
 646 def clean_html(html):
 647     """Clean an HTML snippet into a readable string"""
 648
 649     if html is None:  # Convenience for sanitizing descriptions etc.
 650         return html
 651
 652     html = re.sub(r'\s+', ' ', html)
 653     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 654     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 655     # Strip html tags
 656     html = re.sub('<.*?>', '', html)
 657     # Replace html entities
 658     html = unescapeHTML(html)
 659     return html.strip()
 660
 661
 662 def sanitize_open(filename, open_mode):
 663     """Try to open the given filename, and slightly tweak it if this fails.
 664
 665     Attempts to open the given filename. If this fails, it tries to change
 666     the filename slightly, step by step, until it's either able to open it
 667     or it fails and raises a final exception, like the standard open()
 668     function.
 669
 670     It returns the tuple (stream, definitive_file_name).
 671     """
 672     try:
 673         if filename == '-':
 674             if sys.platform == 'win32':
 675                 import msvcrt
 676                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 677             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 678         stream = locked_file(filename, open_mode, block=False).open()
 679         return (stream, filename)
 680     except (IOError, OSError) as err:
 681         if err.errno in (errno.EACCES,):
 682             raise
 683
 684         # In case of error, try to remove win32 forbidden chars
 685         alt_filename = sanitize_path(filename)
 686         if alt_filename == filename:
 687             raise
 688         else:
 689             # An exception here should be caught in the caller
 690             stream = locked_file(filename, open_mode, block=False).open()
 691             return (stream, alt_filename)
 692
 693
 694 def timeconvert(timestr):
 695     """Convert RFC 2822 defined time string into system timestamp"""
 696     timestamp = None
 697     timetuple = email.utils.parsedate_tz(timestr)
 698     if timetuple is not None:
 699         timestamp = email.utils.mktime_tz(timetuple)
 700     return timestamp
 701
 702
 703 def sanitize_filename(s, restricted=False, is_id=False):
 704     """Sanitizes a string so it could be used as part of a filename.
 705     If restricted is set, use a stricter subset of allowed characters.
 706     Set is_id if this is not an arbitrary string, but an ID that should be kept
 707     if possible.
 708     """
 709     def replace_insane(char):
 710         if restricted and char in ACCENT_CHARS:
 711             return ACCENT_CHARS[char]
 712         elif not restricted and char == '\n':
 713             return ' '
 714         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 715             return ''
 716         elif char == '"':
 717             return '' if restricted else '\''
 718         elif char == ':':
 719             return '_-' if restricted else ' -'
 720         elif char in '\\/|*<>':
 721             return '_'
 722         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 723             return '_'
 724         if restricted and ord(char) > 127:
 725             return '_'
 726         return char
 727
 728     if s == '':
 729         return ''
 730     # Handle timestamps
 731     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 732     result = ''.join(map(replace_insane, s))
 733     if not is_id:
 734         while '__' in result:
 735             result = result.replace('__', '_')
 736         result = result.strip('_')
 737         # Common case of "Foreign band name - English song title"
 738         if restricted and result.startswith('-_'):
 739             result = result[2:]
 740         if result.startswith('-'):
 741             result = '_' + result[len('-'):]
 742         result = result.lstrip('.')
 743         if not result:
 744             result = '_'
 745     return result
 746
 747
 748 def sanitize_path(s, force=False):
 749     """Sanitizes and normalizes path on Windows"""
 750     if sys.platform == 'win32':
 751         force = False
 752         drive_or_unc, _ = os.path.splitdrive(s)
 753         if sys.version_info < (2, 7) and not drive_or_unc:
 754             drive_or_unc, _ = os.path.splitunc(s)
 755     elif force:
 756         drive_or_unc = ''
 757     else:
 758         return s
 759
 760     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 761     if drive_or_unc:
 762         norm_path.pop(0)
 763     sanitized_path = [
 764         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 765         for path_part in norm_path]
 766     if drive_or_unc:
 767         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 768     elif force and s[0] == os.path.sep:
 769         sanitized_path.insert(0, os.path.sep)
 770     return os.path.join(*sanitized_path)
 771
 772
 773 def sanitize_url(url):
 774     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 775     # the number of unwanted failures due to missing protocol
 776     if url.startswith('//'):
 777         return 'http:%s' % url
 778     # Fix some common typos seen so far
 779     COMMON_TYPOS = (
 780         # https://github.com/ytdl-org/youtube-dl/issues/15649
 781         (r'^httpss://', r'https://'),
 782         # https://bx1.be/lives/direct-tv/
 783         (r'^rmtp([es]?)://', r'rtmp\1://'),
 784     )
 785     for mistake, fixup in COMMON_TYPOS:
 786         if re.match(mistake, url):
 787             return re.sub(mistake, fixup, url)
 788     return url
 789
 790
 791 def extract_basic_auth(url):
 792     parts = compat_urlparse.urlsplit(url)
 793     if parts.username is None:
 794         return url, None
 795     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 796         parts.hostname if parts.port is None
 797         else '%s:%d' % (parts.hostname, parts.port))))
 798     auth_payload = base64.b64encode(
 799         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 800     return url, 'Basic ' + auth_payload.decode('utf-8')
 801
 802
 803 def sanitized_Request(url, *args, **kwargs):
 804     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 805     if auth_header is not None:
 806         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 807         headers['Authorization'] = auth_header
 808     return compat_urllib_request.Request(url, *args, **kwargs)
 809
 810
 811 def expand_path(s):
 812     """Expand shell variables and ~"""
 813     return os.path.expandvars(compat_expanduser(s))
 814
 815
 816 def orderedSet(iterable):
 817     """ Remove all duplicates from the input iterable """
 818     res = []
 819     for el in iterable:
 820         if el not in res:
 821             res.append(el)
 822     return res
 823
 824
 825 def _htmlentity_transform(entity_with_semicolon):
 826     """Transforms an HTML entity to a character."""
 827     entity = entity_with_semicolon[:-1]
 828
 829     # Known non-numeric HTML entity
 830     if entity in compat_html_entities.name2codepoint:
 831         return compat_chr(compat_html_entities.name2codepoint[entity])
 832
 833     # TODO: HTML5 allows entities without a semicolon. For example,
 834     # '&Eacuteric' should be decoded as 'Éric'.
 835     if entity_with_semicolon in compat_html_entities_html5:
 836         return compat_html_entities_html5[entity_with_semicolon]
 837
 838     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 839     if mobj is not None:
 840         numstr = mobj.group(1)
 841         if numstr.startswith('x'):
 842             base = 16
 843             numstr = '0%s' % numstr
 844         else:
 845             base = 10
 846         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 847         try:
 848             return compat_chr(int(numstr, base))
 849         except ValueError:
 850             pass
 851
 852     # Unknown entity in name, return its literal representation
 853     return '&%s;' % entity
 854
 855
 856 def unescapeHTML(s):
 857     if s is None:
 858         return None
 859     assert type(s) == compat_str
 860
 861     return re.sub(
 862         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 863
 864
 865 def escapeHTML(text):
 866     return (
 867         text
 868         .replace('&', '&amp;')
 869         .replace('<', '&lt;')
 870         .replace('>', '&gt;')
 871         .replace('"', '&quot;')
 872         .replace("'", '&#39;')
 873     )
 874
 875
 876 def process_communicate_or_kill(p, *args, **kwargs):
 877     try:
 878         return p.communicate(*args, **kwargs)
 879     except BaseException:  # Including KeyboardInterrupt
 880         p.kill()
 881         p.wait()
 882         raise
 883
 884
 885 class Popen(subprocess.Popen):
 886     if sys.platform == 'win32':
 887         _startupinfo = subprocess.STARTUPINFO()
 888         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 889     else:
 890         _startupinfo = None
 891
 892     def __init__(self, *args, **kwargs):
 893         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 894
 895     def communicate_or_kill(self, *args, **kwargs):
 896         return process_communicate_or_kill(self, *args, **kwargs)
 897
 898
 899 def get_subprocess_encoding():
 900     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 901         # For subprocess calls, encode with locale encoding
 902         # Refer to http://stackoverflow.com/a/9951851/35070
 903         encoding = preferredencoding()
 904     else:
 905         encoding = sys.getfilesystemencoding()
 906     if encoding is None:
 907         encoding = 'utf-8'
 908     return encoding
 909
 910
 911 def encodeFilename(s, for_subprocess=False):
 912     """
 913     @param s The name of the file
 914     """
 915
 916     assert type(s) == compat_str
 917
 918     # Python 3 has a Unicode API
 919     if sys.version_info >= (3, 0):
 920         return s
 921
 922     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 923     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 924     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 925     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 926         return s
 927
 928     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 929     if sys.platform.startswith('java'):
 930         return s
 931
 932     return s.encode(get_subprocess_encoding(), 'ignore')
 933
 934
 935 def decodeFilename(b, for_subprocess=False):
 936
 937     if sys.version_info >= (3, 0):
 938         return b
 939
 940     if not isinstance(b, bytes):
 941         return b
 942
 943     return b.decode(get_subprocess_encoding(), 'ignore')
 944
 945
 946 def encodeArgument(s):
 947     if not isinstance(s, compat_str):
 948         # Legacy code that uses byte strings
 949         # Uncomment the following line after fixing all post processors
 950         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 951         s = s.decode('ascii')
 952     return encodeFilename(s, True)
 953
 954
 955 def decodeArgument(b):
 956     return decodeFilename(b, True)
 957
 958
 959 def decodeOption(optval):
 960     if optval is None:
 961         return optval
 962     if isinstance(optval, bytes):
 963         optval = optval.decode(preferredencoding())
 964
 965     assert isinstance(optval, compat_str)
 966     return optval
 967
 968
 969 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 970
 971
 972 def timetuple_from_msec(msec):
 973     secs, msec = divmod(msec, 1000)
 974     mins, secs = divmod(secs, 60)
 975     hrs, mins = divmod(mins, 60)
 976     return _timetuple(hrs, mins, secs, msec)
 977
 978
 979 def formatSeconds(secs, delim=':', msec=False):
 980     time = timetuple_from_msec(secs * 1000)
 981     if time.hours:
 982         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 983     elif time.minutes:
 984         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 985     else:
 986         ret = '%d' % time.seconds
 987     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 988
 989
 990 def _ssl_load_windows_store_certs(ssl_context, storename):
 991     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 992     try:
 993         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 994                  if encoding == 'x509_asn' and (
 995                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 996     except PermissionError:
 997         return
 998     for cert in certs:
 999         try:
1000             ssl_context.load_verify_locations(cadata=cert)
1001         except ssl.SSLError:
1002             pass
1003
1004
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler whose SSL context is configured from `params`.

    Keys read from `params`:
        nocheckcertificate:   if set, certificate and hostname checks are disabled
        legacyserverconnect:  if set, SSL_OP_LEGACY_SERVER_CONNECT is enabled

    Any further keyword arguments are passed on to YoutubeDLHTTPSHandler.
    """
    opts_check_certificate = not params.get('nocheckcertificate')

    def new_context():
        # Single place that applies the user's options, so that every context
        # created here (including the fallback one below) is configured alike
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.check_hostname = opts_check_certificate
        if params.get('legacyserverconnect'):
            ctx.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        ctx.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
        return ctx

    context = new_context()
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded.
                # It goes through new_context() so that 'legacyserverconnect' is not lost
                context = new_context()
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1028
1029
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text.

    `before` is the text the message will follow. Its trailing whitespace is
    stripped and it is joined to the message with a single space. The message
    starts with a capital letter when `before` is empty or ends a sentence.
    """
    msg = ('please report this issue on  https://github.com/yt-dlp/yt-dlp , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    if not prefix:
        return msg[0].title() + msg[1:]
    if prefix.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]
    return prefix + ' ' + msg
1040
1041
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # No explicit message: keep the class-level default if there is
            # one, otherwise fall back to the name of the exception class
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1052
1053
# Exception types that ExtractorError treats as expected (network related) errors
network_exceptions = (
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
    # Only included when the ssl module in use provides it
    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
)
1058
1059
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        cause, video_id and ie are stored on the exception and, when given,
        are included in the final message. Unless the error is expected, the
        bug report notice is appended to the message.
        """
        # An error raised while one of the network exceptions is being handled
        # is always considered expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            # The stringified message, not `msg` itself: str.join rejects
            # non-string items such as an exception object passed as `msg`
            self.orig_msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback(s) of this error and its cause, or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
1090
1091
class UnsupportedError(ExtractorError):
    """Expected extractor error for an unsupported URL; the URL is kept in `url`."""

    def __init__(self, url):
        self.url = url
        super().__init__('Unsupported URL: %s' % url, expected=True)
1097
1098
class RegexNotFoundError(ExtractorError):
    """Extractor error for a regular expression that did not match."""
1102
1103
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    May be thrown when a website does not make a video available to the
    user's geographic location. The error is always marked as expected;
    `countries` is stored on the exception as given.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # A geo restriction is never a bug, whatever the caller passed
        kwargs.update(expected=True)
        super().__init__(msg, **kwargs)
        self.countries = countries
1115
1116
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may throw this exception, carrying the
    appropriate error message, when they are not configured to continue
    on errors.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1129
1130
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
1138
1139
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # The file name, when known, is appended to the class-level message
        if filename is not None:
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1152
1153
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal
    that the postprocessing task failed.
    """
1160
1161
class DownloadCancelled(YoutubeDLError):
    """ Raised to interrupt the download queue """
    msg = 'The download was cancelled'
1165
1166
class ExistingVideoReached(DownloadCancelled):
    """ Download cancelled because --break-on-existing was triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1170
1171
class RejectedVideoReached(DownloadCancelled):
    """ Download cancelled because --break-on-reject was triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1175
1176
class MaxDownloadsReached(DownloadCancelled):
    """ Download cancelled because the --max-downloads limit has been reached """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1180
1181
class ReExtractInfo(YoutubeDLError):
    """ Signals that the video info has to be extracted again. """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1188
1189
class ThrottledDownload(ReExtractInfo):
    """ The download speed fell below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always raised with the class-level message and never as "expected"
        super().__init__(self.msg, expected=False)
1196
1197
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # The underlying error, when given, is appended to the class-level message
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1210
1211
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server first announced, which indicates that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1227
1228
class XAttrMetadataError(YoutubeDLError):
    """Error with an extended attribute; `reason` classifies the failure.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED' and is
    derived from the error code and the message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1243
1244
class XAttrUnavailableError(YoutubeDLError):
    """Error for unavailable extended attribute support (adds nothing to its base class)."""
1247
1248
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and make it honour the 'source_address' option.

    The connection object is created as `http_class(*args, **kwargs)`. If
    `ydl_handler._params` has a non-None 'source_address', the connection is
    set up so that sockets are bound to that address and only remote
    addresses of the matching IP family are tried.

    `is_https` is only consulted by the Python 2.6 fallback, which wraps the
    socket with SSL itself.

    Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        # NOTE: the `source_address` parameter below shadows the outer variable; it is
        # indexed and passed to bind(), so it is presumably an (address, port) tuple like `sa`
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A '.' in the source address selects IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                # The host resolves, but not to any address of the required family
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connection wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and move on to the next candidate
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed (re-raise the last error), or there were none
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 in the source address tuple
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() with a version
            # that creates the socket through _create_connection
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1312
1313
def handle_youtubedl_headers(headers):
    """Apply the internal "Youtubedl-no-compression" marker to `headers`.

    Without the marker, `headers` is returned as is. With it, a new dict is
    returned that contains neither the marker nor any Accept-Encoding header
    (matched case-insensitively); `headers` itself is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered = {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding'
    }
    del filtered['Youtubedl-no-compression']
    return filtered
1322
1323
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped,
    deflated and brotli-compressed responses from web servers. If
    compression is to be avoided in a particular request, the original
    request in the program code only has to include the HTTP header
    "Youtubedl-no-compression", which will be removed before making the
    real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # The SOCKS proxy to use is passed along in an internal header, which
        # must not be sent with the actual request
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without a zlib header."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data."""
        if not data:
            return data
        return compat_brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
1459
1460
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of `base_class` that connects through a SOCKS proxy.

    @param base_class   HTTPConnection, HTTPSConnection or a subclass of them
    @param socks_proxy  Proxy URL; its scheme must be one of socks, socks4,
                        socks4a or socks5 (case-insensitive). The port
                        defaults to 1080; username and password are optional
    @returns            The connection class
    @raises ValueError  If the scheme of `socks_proxy` is not supported
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail here with a clear message instead of leaving socks_type unbound,
        # which used to surface further down as an UnboundLocalError
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS is layered on top of the proxied socket
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1502
1503
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections via _create_http_connection.

    `https_conn_class` selects the connection class and defaults to
    HTTPSConnection; a SOCKS proxy requested through the internal
    'Ytdl-socks-proxy' header is applied on top of it.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class

        # The internal proxy header must not be sent with the actual request
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # Pass on only those settings that the base handler actually stored
        # (_context: python > 2.6, _check_hostname: python 3.x)
        open_kwargs = {
            option: getattr(self, attr)
            for option, attr in (('context', '_context'), ('check_hostname', '_check_hostname'))
            if hasattr(self, attr)
        }

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
1527
1528
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                expires = '' if cookie.expires is None else compat_str(cookie.expires)
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name, value = '', cookie.name
                else:
                    name, value = cookie.name, cookie.value
                fields = [cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value]
                f.write('\t'.join(fields) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Drop the HttpOnly marker so that the entry is read like any other
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            fields = line.split('\t')
            if len(fields) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(fields))
            entry = self._CookieFileEntry(*fields)
            if entry.expires_at and not entry.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
            return line

        # Collect the acceptable lines in memory and let the base class parse those
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    prepared = prepare_line(line)
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                else:
                    cf.write(prepared)
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1645
1646
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose request/response hooks also serve HTTPS."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # This hook used to hold a Python 2 only workaround (now disabled) that
        # percent-encoded non-ASCII characters in the Set-Cookie/Set-Cookie2
        # headers before HTTPCookieProcessor processed them, see
        # https://github.com/ytdl-org/youtube-dl/issues/6769
        # The response is now simply passed on to the base class.
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1669
1670
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """Redirect handler used by YoutubeDL

    Modelled on CPython's HTTPRedirectHandler [1], with two additions:
     - the redirect URL is forced to unicode under python 2
     - the experimental status code 308 Permanent Redirect [2], which
       some sites rely on [3], is followed like the other 30x codes

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Every supported 30x status shares the stock 302 implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect response.

        Invoked by the http_error_30x methods. Returns the new Request
        when the redirect should be followed and raises HTTPError when
        the method/status combination must not be redirected.
        """
        method = req.get_method()
        follows_safe_method = code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
        follows_post = code in (301, 302, 303) and method == "POST"
        if not (follows_safe_method or follows_post):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # RFC 2616 says a 301/302 answer to a POST must not be followed
        # without user confirmation; virtually every client redirects
        # anyway, and so does this handler.

        # python 2: urlh.geturl() can hand back a byte string, so coerce
        # the URL to unicode
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Tolerate URIs containing spaces. http_error_302() already does a
        # more thorough encoding; this is kept for other callers.
        newurl = newurl.replace(' ', '%20')

        # Headers describing the original request body are not forwarded
        body_headers = ("content-length", "content-type")
        forwarded = {
            name: value for name, value in req.headers.items()
            if name.lower() not in body_headers}
        return compat_urllib_request.Request(
            newurl, headers=forwarded, origin_req_host=req.origin_req_host,
            unverifiable=True)
1726
1727
def extract_timezone(date_str):
    """Split a trailing UTC offset (or "Z") off a date string.

    Returns a (timedelta, remaining_string) tuple. When no timezone suffix
    is recognized the offset is zero and the string is returned unchanged.
    """
    match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not match:
        return datetime.timedelta(), date_str

    # Drop the matched suffix from the end of the string
    remainder = date_str[:-len(match.group('tz'))]
    if not match.group('sign'):
        # Bare "Z": UTC
        return datetime.timedelta(), remainder

    direction = 1 if match.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(match.group('hours')),
        minutes=direction * int(match.group('minutes')))
    return offset, remainder
1752
1753
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  separator between the date and time parts
    @param timezone   UTC offset as a timedelta; when None it is taken from
                      the end of date_str via extract_timezone
    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1771
1772
def date_formats(day_first=True):
    """Return the strptime format list matching the requested day/month order"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1775
1776
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM markers (with an optional trailing
    # timezone word) and a numeric UTC offset are stripped
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')

    if upload_date is None:
        # Fall back to the RFC 2822 style parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')

    return None if upload_date is None else compat_str(upload_date)
1803
1804
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a loosely formatted date string

    @param day_first  prefer day-before-month formats for ambiguous dates
    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # The meridiem must be detected before the markers are stripped below
    is_pm = re.search(r'(?i)PM', date_str) is not None
    is_am = not is_pm and re.search(r'(?i)\d\s*AM\b', date_str) is not None
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    def meridiem_hours(hour):
        # The formats parse the hour as 24h, so only 1-11 PM needs +12;
        # 12 PM is already noon and 12 AM is midnight (hour 0)
        if is_pm and hour < 12:
            return 12
        if is_am and hour == 12:
            return -12
        return 0

    for expression in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
            dt = parsed - timezone + datetime.timedelta(hours=meridiem_hours(parsed.hour))
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The UTC offset was already stripped from date_str by
        # extract_timezone, so it has to be applied here as well
        return (calendar.timegm(timetuple)
                + meridiem_hours(timetuple[3]) * 3600
                - int(timezone.total_seconds()))
1836
1837
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, or return default_ext"""
    if url is None or '.' not in url:
        return default_ext

    # Text after the last dot of the part preceding the query string
    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1849
1850
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" """
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1853
1854
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is None:
        # No relative part: plain date in the requested format
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the sign is resolved recursively
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is treated as day precision
        months = amount * 12 if unit == 'year' else amount
        new_date = datetime_add_months(start_time, months)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1895
1896
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    If "strict", only an 8-digit date or
    (now|today|yesterday)([+-][0-9](day|week|month|year)(s)?)? is allowed

    format: string date format used to return datetime object from

    Raises ValueError if "strict" is set and date_str is not in an allowed form
    """
    # The relative offset is optional so that a bare "now", "today" or
    # "yesterday" passes strict validation too
    if strict and not re.fullmatch(
            r'\d{8}|(now|today|yesterday)([+-]\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format {date_str}')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1909
1910
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month index so divmod handles year rollover
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year = dt.year + year_shift
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1918
1919
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    precision: microsecond|second|minute|hour|day.
               'microsecond' returns dt unchanged.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    epoch_seconds = calendar.timegm(dt.timetuple())
    step = seconds_per_unit[precision]
    # Add half a step, then floor to a multiple of the step
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1936
1937
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not in that format are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return f'{year}-{month}-{day}'
1946
1947
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A bound that is None is open-ended (earliest/latest representable
        date). Raises ValueError if start lies after end.
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range

        Accepts a date, a datetime (its date part is used) or a string
        understood by date_from_str.
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date but cannot be ordered against
            # plain date objects, so compare its date part
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1977
1978
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte results are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1987
1988
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1995
1996
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Writes s through the Win32 WriteConsoleW API when out is file
    descriptor 1 or 2 and is attached to a console. In every other case
    nothing is written and False is returned so the caller can fall back
    to a regular write. Raises OSError if WriteConsoleW reports failure.
    """
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle
    # (-11 / -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Prototype + bind the kernel32 functions used below
    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of units written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    # NOTE(review): the handle parameter is declared as DWORD here, unlike the
    # HANDLE used for the other prototypes - confirm this is intended
    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, a handle whose file type
        # (ignoring the REMOTE flag) is not CHAR, or one for which
        # GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; a leading non-BMP character is written on its own
    # with a length of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
2069
2070
def write_string(s, out=None, encoding=None):
    """Write the string s to out (sys.stderr by default) and flush it

    @param encoding  encoding used when bytes have to be written; when None
                     the stream's own or the preferred encoding is used
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    # On Windows try the console API first, unless an encoding was forced
    if sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'):
        if _windows_write_string(s, stream):
            return

    binary_stream = (
        'b' in getattr(stream, 'mode', '')
        or sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if binary_stream:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        # Text stream with an underlying binary buffer: encode manually so
        # that unencodable characters are dropped
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
2091
2092
def bytes_to_intlist(bs):
    """Convert a byte sequence into a list of integer byte values"""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3; otherwise convert each element
    first_is_int = isinstance(bs[0], int)
    return list(bs) if first_is_int else [ord(ch) for ch in bs]
2100
2101
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a bytes object"""
    if not xs:
        return b''
    # One unsigned-byte field per element
    layout = '%dB' % len(xs)
    return compat_struct_pack(layout, *xs)
2106
2107
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using
# LockFileEx/UnlockFileEx on Windows and fcntl elsewhere; when neither is
# available both functions raise IOError.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock f; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 requests an exclusive lock, 0x1 makes the call fail
        # immediately instead of waiting
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock f; raises BlockingIOError if block is false and the
            lock is held elsewhere"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                # Applies to shared locks as well as exclusive ones
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock taken by _lock_file"""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive, block):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
2195
2196
class locked_file(object):
    """File wrapper that holds a lock on the file while it is in use

    The file is opened in __init__ and locked in __enter__ (or open()):
    read modes take a shared lock, all other modes an exclusive one.
    __exit__ (or close()) releases the lock and closes the file.
    """
    _closed = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode   one of 'r', 'rb', 'a', 'ab', 'w', 'wb'
        @param block  wait for the lock instead of failing immediately
        """
        assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
        # NOTE(review): with 'w'/'wb' the file is truncated here, i.e. before
        # the lock is acquired in __enter__
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode
        self.block = block

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
        except IOError:
            self.f.close()
            # Nothing is locked, so a later close() must not try to unlock
            # the already closed file
            self._closed = True
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            if not self._closed:
                _unlock_file(self.f)
        finally:
            self.f.close()
            self._closed = True

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

    def flush(self):
        self.f.flush()

    def open(self):
        return self.__enter__()

    def close(self, *args):
        """Unlock and close the file; any arguments are ignored"""
        self.__exit__(None, None, None)
2240
2241
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2245
2246
def shell_quote(args):
    """Return the arguments shell-quoted and joined by spaces"""
    fs_encoding = get_filesystem_encoding()
    # Byte arguments (e.g. filenames encoded with 'encodeFilename') are
    # decoded with the filesystem encoding before quoting
    return ' '.join(
        compat_shlex_quote(arg.decode(fs_encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2256
2257
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled in url is kept and takes precedence over data.
    The data argument itself is left unmodified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so the caller's dict is not mutated
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2266
2267
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) for a URL produced by smuggle_url

    URLs without smuggled data are returned unchanged together with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload lives in the last fragment
    url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
2275
2276
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param fmt     %-format taking the scaled number and the suffix
    @param factor  base of the unit steps; 1024 yields Ki, Mi, ... suffixes
    Returns None if num is not a number or is negative.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = ['', *'kMGTPEZY']
    exponent = 0 if num == 0 else int(math.log(num, factor))
    # Keep the exponent inside the suffix table: tiny values would otherwise
    # index from the end of the list and huge values would raise IndexError
    exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2288
2289
def format_bytes(bytes):
    """Format a byte count with binary suffixes, or 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2292
2293
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s and scale it by the unit

    @param unit_table  mapping of unit name to multiplier
    Returns the scaled value as an int, or None if s does not match.
    """
    unit_alternatives = '|'.join(map(re.escape, unit_table))
    match = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % unit_alternatives, s)
    if match is None:
        return None
    # A comma is treated as the decimal separator
    number = float(match.group('num').replace(',', '.'))
    return int(number * unit_table[match.group('unit')])
2303
2304
def parse_filesize(s):
    """Parse a human readable file size such as "1.5 MiB" into bytes

    Returns None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    # (letter, decimal name, binary name) for 1000**n / 1024**n, n = 1, 2, ...
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        _UNIT_TABLE.update({
            letter + 'iB': binary,
            letter + 'B': decimal,
            lower + 'B': binary,
            letter + 'b': decimal,
            lower + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
2374
2375
def parse_count(s):
    """Parse a count such as "1,234" or "1.2M views" into an int

    Returns None if s is None or no number can be found.
    """
    if s is None:
        return None

    # Drop a leading non-numeric word
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Plain number followed by arbitrary text
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', text)
    return str_to_int(leading_number.group(1)) if leading_number else None
2403
2404
def parse_resolution(s):
    """Extract the resolution from a string

    Returns a dict with 'width' and 'height', with only 'height', or an
    empty dict when nothing is recognized.
    """
    if s is None:
        return {}

    # "<width>x<height>"
    dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    # "<height>p" / "<height>i"
    scanlines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scanlines:
        return {'height': int(scanlines.group(1))}

    # "4k" / "8k"
    marketing = re.search(r'\b([48])[kK]\b', s)
    if marketing:
        return {'height': int(marketing.group(1)) * 540}

    return {}
2425
2426
def parse_bitrate(s):
    """Return the number in front of "kbps" in s as an int, or None"""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2433
2434
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in the given language

    Unknown languages fall back to the English names; returns None if the
    name is not found. """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    for number, month in enumerate(month_names, 1):
        if month == name:
            return number
    return None
2444
2445
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is unknown """

    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        # An abbreviation is the first three letters of the month name
        if month[:3] == abbrev:
            return number
    return None
2454
2455
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML"""
    # Ampersands followed by a predefined or numeric entity are left alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2462
2463
def setproctitle(title):
    """Set the process name via prctl() from libc.so.6 (best effort)

    Does nothing when that library cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initializing from the bytes allocates one extra byte for the
    # terminating NUL that prctl expects
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2488
2489
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is if it is None
    or does not begin with start"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2492
2493
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is if it is None,
    end is empty, or s does not finish with end"""
    # An empty suffix must be special-cased: s[:-0] would be the empty string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2496
2497
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2505
2506
def get_domain(url):
    """Return the host part of url (without scheme and a leading 'www.'),
    or None when the text contains no dotted host name."""
    match = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if match is None:
        return None
    return match.group('domain')
2510
2511
def url_basename(url):
    """Return the last segment of the URL's path component ('' if the path
    is empty or consists only of slashes)."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
2515
2516
def base_url(url):
    """Return url truncated after the last '/' that precedes any '?', '#'
    or '&'.

    Raises AttributeError when url is not an http(s) URL containing such a
    slash, because the failed match is dereferenced unconditionally.
    """
    match = re.match(r'https?://[^?#&]+/', url)
    return match.group()
2519
2520
def urljoin(base, path):
    """Join path onto base, returning None for unusable input.

    bytes arguments are decoded as UTF-8. A path that already is absolute
    (scheme-qualified or protocol-relative) is returned as is. Otherwise base
    must be an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    base_is_usable = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not base_is_usable:
        return None
    return compat_urlparse.urljoin(base, path)
2534
2535
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that always reports 'HEAD' as its HTTP method."""

    def get_method(self):
        # Constant override: the method does not depend on the request's data
        return 'HEAD'
2539
2540
class PUTRequest(compat_urllib_request.Request):
    """Request subclass that always reports 'PUT' as its HTTP method."""

    def get_method(self):
        # Constant override: the method does not depend on the request's data
        return 'PUT'
2544
2545
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning `default` when that is impossible.

    @param get_attr  If set (and v is not None), the attribute of that name
                     is read from v and converted instead of v itself
    @param scale     The result is floor-divided by this value
    @param invscale  The result is multiplied by this value before dividing
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        number = int(v)
        return number * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2553
2554
def str_or_none(v, default=None):
    """Return v converted to text, or `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
2557
2558
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned unchanged; strings have ',', '.' and '+' removed
    before conversion. Any other type yields None.
    """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if not isinstance(int_str, compat_str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2566
2567
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float (multiplied by invscale, divided by scale),
    returning `default` when v is None or cannot be converted."""
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2575
2576
def bool_or_none(v, default=None):
    """Return v when it is an actual bool, otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
2579
2580
def strip_or_none(v, default=None):
    """Return v with surrounding whitespace stripped when it is a text
    string, otherwise `default`."""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2583
2584
def url_or_none(url):
    """Return the whitespace-stripped url if it is a text string that is
    protocol-relative or uses one of the accepted schemes (http(s), rtmp
    variants, rtsp variants, mms, ftp(s)); otherwise None."""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2590
2591
def request_to_url(req):
    """Return the full URL of a Request object; any other value is returned
    unchanged."""
    if not isinstance(req, compat_urllib_request.Request):
        return req
    return req.get_full_url()
2597
2598
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with date_format, returning `default` on failure.

    @param timestamp    Either a number (interpreted as a UNIX timestamp and
                        converted to a UTC datetime) or a 'YYYYMMDD' string
    @param date_format  strftime() format string
    @param default      Returned for unsupported types, unparsable values and
                        timestamps outside the range the platform can convert
    """
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            return default
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # utcfromtimestamp() raises OverflowError/OSError (not ValueError) for
        # timestamps the platform's C time functions cannot represent; an
        # "_or_none" helper must not let those escape.
        return default
2609
2610
def parse_duration(s):
    """Parse a textual duration into a number of seconds.

    Three notations are tried in order:
      1. Colon-separated clock style, [[[days:]hours:]mins:]secs, with an
         optional fraction introduced by '.' or ':'.
      2. Unit-suffixed components (optionally with a leading 'P' and a 'T'
         separator), e.g. days/hours/minutes/seconds with short or long
         unit names. Year, month and week components are accepted by the
         pattern but are not captured, so they do not contribute to the result.
      3. A single, possibly fractional, amount of hours or of minutes.
    All notations tolerate a trailing 'Z'.

    Returns None when s is not a string, is blank, or matches none of the
    notations; otherwise the total number of seconds.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # Notation 1: once a "mins:" prefix is present, the seconds field is
    # limited to one or two digits (conditional group on before_secs)
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Notation 2: the only capturing groups are days, hours, mins, secs
        # and ms, in that order, which is what m.groups() below relies on
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Notation 3: either hours or minutes (the alternation means at
            # most one of the two groups is set)
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    # Sum up whichever components were captured
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its separator; ':' is normalised to '.' so the
        # captured text is read as a fraction of a second
        duration += float(ms.replace(':', '.'))
    return duration
2673
2674
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the file's current extension.

    If expected_real_ext is given and differs from the current extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
2681
2682
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file's current extension for ext.

    If expected_real_ext is given and differs from the current extension,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
2688
2689
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when launching the binary fails with OSError. """
    try:
        proc = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate_or_kill()
    except OSError:
        return False
    else:
        return exe
2698
2699
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr output as
    text, or False when the process cannot be started (OSError)."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        proc = Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        out, _ = proc.communicate_or_kill()
    except OSError:
        return False
    if not isinstance(out, bytes):
        return out
    # Python 2.x
    return out.decode('ascii', 'ignore')
2713
2714
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    @param output        Text to search; must be a text string
    @param version_re    Regex whose first group is the version; defaults to
                         matching 'version <something>'
    @param unrecognized  Returned when the regex does not match
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2724
2725
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    False is also returned when the executable produced no output. """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2732
2733
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the source iterable only when needed and are kept
    in an internal cache, so every item is produced at most once.
    Operations that need the end of the sequence (len(), negative indices,
    open-ended slices, reversed iteration, repr/str) consume the whole
    iterable.'''

    class IndexError(IndexError):
        # Dedicated subclass raised by __getitem__ for out-of-range accesses
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        '''
        @param iterable  Source of the items
        @param reverse   Present the items in reverse order
        @param _cache    Internal: list of already evaluated items to share
                         (used by __reversed__ and __copy__)
        '''
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what has already been evaluated, then continue pulling from
        # the source while recording every new item in the cache
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        # Move everything that is left in the source into the cache and
        # return the cache itself (always in forward order)
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable

        Returns a new list of all items in presentation order
        (reversed if this LazyList is reversed) '''
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Map an index into the reversed view to the equivalent index counted
        # from the other end of the cache (0 -> -1, 1 -> -2, ...; None stays None)
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        # Normalise the request to (start, stop, step) in terms of the
        # forward-ordered cache
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            # step 0 marks a plain index: neither of the open-ended-slice
            # checks below can trigger for it
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items required so that the cache reaches the
        # largest index involved in the request
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only the first item in presentation order is requested; note that
        # for a reversed list this is index -1 of the view, which __getitem__
        # maps to cache index 0
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known after consuming the whole iterable
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # The new object is given the same source iterable and cache list,
        # only the direction flag is flipped
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # The copy is given the same source iterable and cache list
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2822
2823
class PagedList:
    """Base class for lists whose entries are fetched one page at a time.

    Pages are produced by pagefunc(pagenum) and optionally cached.
    Subclasses must implement _getslice(start, end).
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        every_entry = self.getslice()
        return len(every_entry)

    def getpage(self, pagenum):
        """Return the entries of page pagenum as a list, consulting and
        filling the cache; pages beyond the known page count are empty"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2862
2863
class OnDemandPagedList(PagedList):
    """PagedList that requests pages one after another until a page comes
    back short or the requested end index is reached."""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the wanted entries inside this particular page
            startv = start % self._pagesize if firstid <= start < nextfirstid else 0
            endv = None
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched before
                # propagating the error
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # Stop when either
            # - the current page is not "full", i.e. it does not contain
            #   page_size videos, in which case we can assume it is the last
            #   one and there is no need to query again, or
            # - we got the whole page, but the next page is not interesting
            if len(page_results) + startv < self._pagesize or end == nextfirstid:
                break
2899             if end == nextfirstid:
2900                 break
2901
2902
class InAdvancePagedList(PagedList):
    """PagedList for sources whose total number of pages is known up front.

    Caching is always enabled.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, use_cache=True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = min(self._pagecount, end // self._pagesize + 1)
            # Number of entries that still have to be produced
            only_more = end - start
        # Entries to drop from the beginning of the first page
        skip_elems = start - start_page * self._pagesize

        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) >= only_more:
                    yield from page_results[:only_more]
                    break
                only_more -= len(page_results)
            yield from page_results
2925
2926
def uppercase_escape(s):
    """Replace every literal '\\UXXXXXXXX' escape sequence (eight hex digits)
    in s by the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        decoded, _consumed = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
2933
2934
def lowercase_escape(s):
    """Replace every literal '\\uXXXX' escape sequence (four hex digits) in s
    by the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        decoded, _consumed = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
2941
2942
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Only Python 2 needs the text encoded to bytes before quoting
    must_encode = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if must_encode:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2948
2949
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host is IDNA-encoded; path, params, query and fragment are
    percent-escaped with escape_rfc3986."""
    url_parsed = compat_urllib_parse_urlparse(url)
    escaped_parts = {
        'netloc': url_parsed.netloc.encode('idna').decode('ascii'),
        'path': escape_rfc3986(url_parsed.path),
        'params': escape_rfc3986(url_parsed.params),
        'query': escape_rfc3986(url_parsed.query),
        'fragment': escape_rfc3986(url_parsed.fragment),
    }
    return url_parsed._replace(**escaped_parts).geturl()
2960
2961
def parse_qs(url):
    """Return the parsed query string of url."""
    query_string = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query_string)
2964
2965
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file, closing batch_fd afterwards.

    Lines are decoded as UTF-8 if needed and stripped of a leading BOM and
    surrounding whitespace. Blank lines and lines starting with '#', ';' or
    ']' are skipped, as is a trailing ' #...' comment.
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        before_comment = re.split(r'\s#', url, maxsplit=1)[0]
        return before_comment.rstrip()

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for line in fd:
            cleaned = fixup(line)
            if cleaned:
                urls.append(cleaned)
    return urls
2983
2984
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2987
2988
def update_url_query(url, query):
    """Return url with the entries of the `query` mapping merged into its
    query string (existing keys are overwritten). A falsy query leaves url
    untouched."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    new_query = compat_urllib_parse_urlencode(qs, True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
2997
2998
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with selected parts replaced.

    @param url      Replacement URL (defaults to the URL of req)
    @param data     Replacement body (defaults to the body of req)
    @param headers  Headers merged over a copy of the headers of req
    @param query    Query parameters merged into the URL

    HEAD and PUT requests keep their method; a 'timeout' attribute on req is
    carried over.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)

    # Pick the request class that reproduces the original HTTP method
    req_type = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)

    new_req = req_type(
        req_url,
        data=req_data,
        headers=req_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3017
3018
def _multipart_encode_impl(data, boundary):
    """Encode the mapping `data` as multipart/form-data using `boundary`.

    Returns (body_bytes, content_type).
    Raises ValueError if the boundary occurs inside any encoded field.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    chunks = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter + b'\r\n')
        chunks.append(content)

    # Closing delimiter
    chunks.append(delimiter + b'--\r\n')

    return b''.join(chunks), content_type
3039
3040
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body, content_type) pair. With a caller-specified boundary,
    ValueError is raised if the boundary collides with the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is used as is; a collision is propagated
        return _multipart_encode_impl(data, boundary)

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
3069
3070
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable of several keys, in d.

    With a list/tuple of keys, the value of the first key that is present
    and not None (and, if skip_false_values is set, truthy) is returned.
    With a single key this is a plain d.get(). `default` is returned when
    nothing is found.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
3079
3080
def try_get(src, getter, expected_type=None):
    """Apply each getter to src and return the first result obtained without
    an AttributeError/KeyError/TypeError/IndexError that also satisfies
    expected_type (if given). Returns None when no getter succeeds."""
    for get in variadic(getter):
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(v, expected_type):
            return v
    return None
3090
3091
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to earlier ones.

    None values are ignored. A key that is already present is only
    overwritten when its current value is an empty string and the new value
    is a non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k in merged:
                current = merged[k]
                fills_empty_string = (
                    isinstance(v, compat_str) and v
                    and isinstance(current, compat_str)
                    and not current)
                if not fills_empty_string:
                    continue
            merged[k] = v
    return merged
3104
3105
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as text: text strings pass through untouched, anything
    else is converted with the given encoding and error handling."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
3108
3109
# Rating label -> age limit; looked up by parse_age_limit() after
# upper-casing its input
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# 'TV-xx' label -> age limit; parse_age_limit() builds its matching regex
# from the part of each key after the 'TV-' prefix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3127
3128
def parse_age_limit(s):
    """Convert an age rating into an age limit in years.

    Accepts plain ints in the range 0..21, strings such as '18' or '18+',
    the labels of US_RATINGS and the labels of TV_PARENTAL_GUIDELINES (with
    '-', '_' or nothing after 'TV'). Returns None for anything else.
    """
    if type(s) is int:  # exact type check: bool and other int subclasses are excluded
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3144
3145
def strip_jsonp(code):
    """Strip a JSONP wrapper - callback name, parentheses, optional ';' and
    trailing '//' comments - and return the argument text of the call.
    Input that does not look like a JSONP call is returned unchanged."""
    jsonp_call = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_call, r'\g<callback_data>', code)
3154
3155
def js_to_json(code, vars={}):
    """Rewrite a JavaScript object/array literal so that it becomes JSON text.

    The conversion is purely token based: a single regex finds strings,
    comments, trailing commas, identifiers, hex/octal integers, integer keys
    and runs of '!', and fix_kv() below decides what each one is replaced by.

    @param code  JavaScript source text
    @param vars  Mapping of identifier -> replacement text; identifiers found
                 in it are substituted verbatim
    @returns     The converted text (it is not validated as JSON here)
    """
    # vars is a dict of var, val pairs to substitute
    # /* ... */ block comments, or // comments up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Optional whitespace with at most one comment in between
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for integer literals that need conversion:
    # hexadecimal and octal, each optionally followed by ':' when used as a key
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Replacement callback for every token matched by the regex at the
        # bottom of this function
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, '!' runs and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Quoted string: strip the quotes and re-escape the content for a
            # double-quoted string (escape '"', unescape "\'", remove
            # backslash-newline continuations, turn '\x' into '\u00')
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Keys must be quoted, plain values stay numeric
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        # Everything else (string content, identifiers, integer keys) is
        # emitted wrapped in double quotes
        return '"%s"' % v

    # new Date("...") -> "..."
    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3204
3205
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 if the id is not listed. """
    def rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
3214
3215
# Set of recognised "when" identifiers
# NOTE(review): presumably the stages at which postprocessors may run - confirm against users
POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}


# Output template strings, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Template type -> either a string or None
# NOTE(review): the strings look like filename suffixes for the respective
# output type - confirm against the code consuming this table
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template for a verbose regex and has to be completed with
# str.format(): {0} is the pattern for the mapping key, {1} the pattern for
# the conversion type. The 'prefix' group holds any escaped '%%' pairs that
# directly precede the matched field.
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3255
3256
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than `length` are cut so that, including the appended
    '...', they are `length` characters long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
3265
3266
def version_tuple(v):
    """Split a version string on '.' and '-' and return its components as a
    tuple of ints. Raises ValueError for non-numeric components."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
3269
3270
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    When version is empty or either string cannot be parsed, the answer is
    derived from assume_new: unknown versions count as outdated only if
    assume_new is False.
    """
    outdated_if_unknown = not assume_new
    if not version:
        return outdated_if_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return outdated_if_unknown
3278
3279
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported lazily, at call time
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3286
3287
def args_to_str(args):
    """Return the command `args` as a single space-separated string with
    every argument shell-quoted."""
    # Get a short string representation for a subprocess command
    quoted_args = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted_args)
3291
3292
def error_to_compat_str(err):
    """Return the message of an exception as a text string."""
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
3300
3301
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters after ';' are ignored and the type is matched
    case-insensitively (media type names are case-insensitive). Lookup order:
      1. the complete 'type/subtype',
      2. the subtype alone,
      3. the structured-syntax suffix after '+' (e.g. '+json'),
      4. otherwise the subtype itself, with '+' replaced by '.'.

    @param mt  MIME type string, or None
    @returns   The extension without a leading dot, or None if mt is None
    """
    if mt is None:
        return None

    # Normalise once so that every lookup below sees the same lower-case form
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3364
3365
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension, a filename or a URL.

    A value without any '.' is treated as a bare extension. Returns None for
    empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(target)
    return mime_type
3372
3373
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/text codecs.

    Returns a dict with 'vcodec', 'acodec' ('none' when absent) and
    'dynamic_range', plus 'tcodec' if a text codec was found. The first codec
    of each kind wins. If no codec is recognised but exactly two were given,
    they are assumed to be (vcodec, acodec). Otherwise {} is returned.
    A warning is written to stderr for every unrecognised codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = (
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = (
        'flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3',
        'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    TEXT_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        entry
        for entry in map(str.strip, codecs_str.strip().strip(',').split(','))
        if entry]

    vcodec = acodec = tcodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. 'av01' and 'vp09' compare as 'av1'/'vp9'
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                continue
            if codec in ('vp9', 'av1', 'hvc1'):
                # Only the first four components are kept for these codecs
                vcodec = '.'.join(parts[:4])
            else:
                vcodec = full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif codec in TEXT_CODECS:
            tcodec = tcodec or full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if not (vcodec or acodec or tcodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if tcodec is not None:
        result['tcodec'] = tcodec
    return result
3415
3416
def urlhandle_detect_ext(url_handle):
    """Work out a file extension from the headers of an opened URL handle.

    A quoted attachment filename in Content-Disposition is tried first; if
    it gives no extension, the Content-Type header is mapped through
    mimetype2ext() instead.
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3429
3430
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI embedding the bytes `data` as `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3433
3434
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when either no viewer limit is configured or the
    # content itself carries no restriction
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3443
3444
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a (truthy) match object when, after removing a byte order mark
    and leading whitespace, the data starts with "<"; otherwise None.
    """

    # The UTF-32 marks must be tested before the UTF-16 ones because the
    # UTF-32-LE mark starts with the UTF-16-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    encoding, payload = 'utf-8', first_bytes
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = bom_encoding, first_bytes[len(bom):]
            break

    text = payload.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3463
3464
def determine_protocol(info_dict):
    """Return the download protocol for `info_dict`.

    An explicit 'protocol' entry wins. Otherwise the sanitized 'url' is
    inspected: a known streaming prefix, then a manifest extension, and
    finally the URL scheme.
    """
    explicit_protocol = info_dict.get('protocol')
    if explicit_protocol is not None:
        return explicit_protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    parsed_url = compat_urllib_parse_urlparse(url)
    return parsed_url.scheme
3485
3486
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """Render a header row plus data rows as aligned plain text.

    Each row is a list of values. Inside a cell, the text following a tab
    character is right aligned. With `hide_empty`, columns whose data cells
    all have zero width are dropped. A truthy `delim` is repeated to draw a
    separator line below the header. `extra_gap` adds spaces between columns.
    """
    def visible_width(text):
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [
            max(visible_width(str(cell)) for cell in column)
            for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Columns beyond the end of keep_flags are always kept
        return [
            cell
            for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True)
            if keep]

    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    max_lens = column_widths([header_row] + data)
    extra_gap += 1
    if delim:
        separator = [delim * (ml + extra_gap) for ml in max_lens]
        # Remove extra_gap from end of delimiter
        separator[-1] = separator[-1][:-extra_gap * len(delim)]
        table = [header_row, separator] + data
    else:
        table = [header_row] + data

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - visible_width(text)
            if '\t' in text:
                # The tab expands to the padding, pushing what follows to the right
                cells.append(text.replace('\t', ' ' * padding) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (padding + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3517
3518
3519 def _match_one(filter_part, dct, incomplete):
3520     # TODO: Generalize code with YoutubeDL._build_format_filter
3521     STRING_OPERATORS = {
3522         '*=': operator.contains,
3523         '^=': lambda attr, value: attr.startswith(value),
3524         '$=': lambda attr, value: attr.endswith(value),
3525         '~=': lambda attr, value: re.search(value, attr),
3526     }
3527     COMPARISON_OPERATORS = {
3528         **STRING_OPERATORS,
3529         '<=': operator.le,  # "<=" must be defined above "<"
3530         '<': operator.lt,
3531         '>=': operator.ge,
3532         '>': operator.gt,
3533         '=': operator.eq,
3534     }
3535
3536     operator_rex = re.compile(r'''(?x)\s*
3537         (?P<key>[a-z_]+)
3538         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3539         (?:
3540             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3541             (?P<strval>.+?)
3542         )
3543         \s*$
3544         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3545     m = operator_rex.search(filter_part)
3546     if m:
3547         m = m.groupdict()
3548         unnegated_op = COMPARISON_OPERATORS[m['op']]
3549         if m['negation']:
3550             op = lambda attr, value: not unnegated_op(attr, value)
3551         else:
3552             op = unnegated_op
3553         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3554         if m['quote']:
3555             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3556         actual_value = dct.get(m['key'])
3557         numeric_comparison = None
3558         if isinstance(actual_value, compat_numeric_types):
3559             # If the original field is a string and matching comparisonvalue is
3560             # a number we should respect the origin of the original field
3561             # and process comparison value as a string (see
3562             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3563             try:
3564                 numeric_comparison = int(comparison_value)
3565             except ValueError:
3566                 numeric_comparison = parse_filesize(comparison_value)
3567                 if numeric_comparison is None:
3568                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3569                 if numeric_comparison is None:
3570                     numeric_comparison = parse_duration(comparison_value)
3571         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3572             raise ValueError('Operator %s only supports string values!' % m['op'])
3573         if actual_value is None:
3574             return incomplete or m['none_inclusive']
3575         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3576
3577     UNARY_OPERATORS = {
3578         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3579         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3580     }
3581     operator_rex = re.compile(r'''(?x)\s*
3582         (?P<op>%s)\s*(?P<key>[a-z_]+)
3583         \s*$
3584         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3585     m = operator_rex.search(filter_part)
3586     if m:
3587         op = UNARY_OPERATORS[m.group('op')]
3588         actual_value = dct.get(m.group('key'))
3589         if incomplete and actual_value is None:
3590             return True
3591         return op(actual_value)
3592
3593     raise ValueError('Invalid filter part %r' % filter_part)
3594
3595
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
        When incomplete, all conditions passes on missing fields

        Conditions are separated by "&"; a literal ampersand inside a
        condition is written as "\\&".
    """
    for raw_part in re.split(r'(?<!\\)&', filter_str):
        filter_part = raw_part.replace(r'\&', '&')
        if not _match_one(filter_part, dct, incomplete):
            return False
    return True
3603
3604
def match_filter_func(filter_str):
    """Build a match-filter callback from a filter string.

    Returns None when `filter_str` is None. Otherwise returns a function
    that takes an info dict (extra arguments are forwarded to match_str) and
    returns None if it passes the filter, or a message saying why the video
    is skipped.
    """
    if filter_str is None:
        return None

    def _match_func(info_dict, *args, **kwargs):
        if match_str(filter_str, info_dict, *args, **kwargs):
            return None
        fallback_title = info_dict.get('id', 'video')
        video_title = info_dict.get('title', fallback_title)
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)

    return _match_func
3616
3617
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Accepts a plain offset in seconds with an optional "s" suffix
    ("12", "12.5s") and clock times "H:MM:SS" whose fraction may be written
    after "." or ":" ("00:01:02.5", "00:01:02:500").
    Returns None for a falsy or unrecognised expression.
    """
    if not time_expr:
        return None

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match is not None:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match is None:
        return None
    hours, minutes, seconds = clock_match.groups()
    # A ":"-separated fraction is read as a decimal fraction of a second
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3629
3630
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3633
3634
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    # ASS uses hundredths of a second; "%d" drops the fractional part
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3638
3639
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT.

    Every <p> element that has a begin time and either an end time or a
    duration becomes one SRT cue. The supported tts:* styling properties
    (given inline, through referenced <style> elements, or inherited from
    the style of <body>/<div>) are rendered as <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> element
    '''
    # (current namespace, [legacy namespaces rewritten to it before parsing])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # tts:* attributes that are read; all others are ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # xpath_with_ns is defined elsewhere in this file; presumably it expands
    # the "prefix:" parts of a path using ns_map
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, parent properties included
    default_style = {}  # properties taken from the style of <body>/<div>

    class TTMLPElementParser(object):
        # Parser target that turns the content of one <p> into SRT markup.
        # NOTE(review): the two lists below are class attributes that are only
        # mutated in place, so they are shared by every parser instance
        # created during one dfxp2srt() call; entries left behind by one
        # paragraph are still visible while parsing the next one
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest to highest: default style, referenced
                # style, inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect with the same value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                # One entry per opened element, popped again in end()
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened for this element, innermost first
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialise the element and feed it through a parser driving
        # TTMLPElementParser; the parser's close() hands back the value
        # returned by the target's close()
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> elements into `styles`. A style referencing a parent
    # that is not known yet is deferred, and the pass is repeated until
    # nothing is deferred any more
    while True:
        known_before = len(styles)
        deferred_ids, missing_parents = set(), set()
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    deferred_ids.add(style_id)
                    missing_parents.add(parent_style_id)
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not missing_parents:
            break
        if len(styles) == known_before:
            # No progress: the remaining parents can never be resolved (they
            # are undefined, carry no supported property, or form a cycle),
            # so repeating would loop forever. Treat them as empty styles,
            # preferring parents that are not themselves waiting for a parent
            # so that chains still inherit in order; for a pure cycle all of
            # them are seeded
            for style_id in (missing_parents - deferred_ids) or missing_parents:
                styles[style_id] = {}

    # The style of the first <body> and the first <div> applies to every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # `index` counts every paragraph, so skipped ones leave gaps in the cue numbers
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3802
3803
def cli_option(params, command_option, param):
    """Build the command line arguments for an option that takes a value.

    @param params          Dictionary the value is looked up in
    @param command_option  The option name to emit, e.g. '--proxy'
    @param param           Key of the value in `params`
    @returns               [command_option, value as a string], or [] when
                           the value is missing or None
    """
    param = params.get(param)
    if param is None:
        return []
    # Stringify every value, including falsy ones such as 0, so that the
    # result contains only strings (previously 0 was passed through as an int)
    return [command_option, str(param)]
3809
3810
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for a boolean option.

    Returns [] when `params` has no value (or None) for `param`. Otherwise
    the value must be a bool and is rendered as `true_value`/`false_value`,
    either as a separate argument or, with `separator`, joined to the option
    name in a single argument.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
3819
3820
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals `expected_value`, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3824
3825
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the extra command line arguments configured under `keys`.

    `argdict` maps lowercase keys to argument lists. `keys` is a list/tuple
    whose items are a single key or a group of keys; the first item with at
    least one key present in `argdict` wins, and the argument lists of all
    its present keys are concatenated. A list/tuple `argdict` (the old
    format) is returned unchanged when `use_compat` is set and is otherwise
    ignored. `default` is returned as is (not copied) if nothing matches.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        configured = [argdict.get(key.lower()) for key in variadic(key_list)]
        present = [args for args in configured if args is not None]
        if not present:
            continue
        merged = []
        for args in present:
            merged.extend(args)
        return merged
    return default
3844
3845
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the arguments configured for `exe` when used by `main_key`.

    The lookup keys are "<root><suffix>" for every suffix in `keys`, where
    the root is "<main_key>+<exe>" (or just "<exe>" if both are the same).
    If the bare root is among them, the (main_key, exe) group and 'default'
    are tried as fallbacks as well; if it is not, the old list format of
    `argdict` is not honoured.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3857
3858
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are looked at, so a value
        such as "en-US" is accepted. Returns None for an unknown code.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several short codes map to `code`, the one listed first in the
        table is returned. Returns None for an unknown code.
        """
        matching_short_names = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matching_short_names, None)
4062
4063
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The lookup is case-insensitive; None is returned for an unknown code.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
4322
4323
class GeoUtils(object):
    """Helpers for generating IPv4 addresses inside a country's address block"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted-quad string) from a block.

        @param code_or_block  Either a two-letter country code, which is looked
                              up case-insensitively in _country_ip_map, or an
                              explicit block in CIDR notation ("a.b.c.d/len")
        @returns              The address as a string, or None when a country
                              code has no known block
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        host_mask = 0xffffffff >> int(preflen)
        # Clear any host bits in the given base address so that the whole block
        # is covered even when the base is not aligned to the prefix length
        addr_min = int.from_bytes(socket.inet_aton(addr), 'big') & ~host_mask & 0xffffffff
        addr_max = addr_min | host_mask
        return socket.inet_ntoa(random.randint(addr_min, addr_max).to_bytes(4, 'big'))
4582
4583
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; its value then takes
    precedence over the proxy configured for the scheme.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers that report "no proxy"; the base
        # class initializer is run afterwards with the given proxies
        for scheme in ('http', 'https'):
            setattr(
                self, scheme + '_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request override is consumed (removed from the headers) here
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4607
4608
4609 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4610 # released into Public Domain
4611 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4612
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.

    Zero and negative values are encoded as a single zero byte (before any
    padding is applied).
    """
    n = int(n)
    if n > 0:
        # Shortest big-endian representation, i.e. no leading zero bytes
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4641
4642
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty byte
    string yields 0.
    """
    return int.from_bytes(s, 'big')
4658
4659
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA variant. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the data is read as one big number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return format(pow(payload, exponent, modulus), 'x')
4675
4676
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the padding
    consists of non-zero bytes only: the first zero byte after the header
    marks the start of the data, so a zero inside the padding would corrupt
    the message for the receiver.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for at least
                               8 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + list(data)
4690
4691
def encode_base_n(num, n, table=None):
    """Convert the integer `num` to its string representation in base `n`.

    @param num    Integer to convert. Negative numbers are encoded as "-"
                  followed by the encoding of their absolute value
    @param n      The base
    @param table  Characters to use as digits; only the first `n` are used.
                  Defaults to the digits, then lowercase and then uppercase
                  ASCII letters (which supports bases up to 62)
    @raises ValueError  if the table has fewer than `n` characters, or if a
                        non-zero number is requested in a base below 2
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # The division loop below can never reach zero for these bases
    if n < 2:
        raise ValueError('base %d is too small, at least 2 is required' % n)

    # Floor division of a negative number never reaches zero either, so the
    # sign is handled separately
    sign = ''
    if num < 0:
        sign, num = '-', -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return sign + ''.join(reversed(digits))
4708
4709
def decode_packed_codes(code):
    """Expand packed code found in `code`.

    The packed payload, the base, the symbol count and the "|"-separated
    symbol list are extracted with PACKED_CODES_RE. Every word of the payload
    is then replaced by the symbol whose index it encodes in the given base;
    an empty symbol stands for the encoded word itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed, radix, remaining, words = match.groups()
    radix, remaining = int(radix), int(remaining)
    words = words.split('|')

    # Maps each base-n encoded index to its replacement symbol
    lookup = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        lookup[token] = words[remaining] or token

    def _substitute(word_match):
        return lookup[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, packed)
4726
4727
def caesar(s, alphabet, shift):
    """Shift every character of `s` that occurs in `alphabet` by `shift`
    positions, wrapping around at the end of the alphabet.

    Characters outside the alphabet are kept as they are. With a shift of 0,
    `s` itself is returned.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4735
4736
def rot47(s):
    """Apply a caesar shift of 47 over the 94 printable, non-space ASCII characters"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4739
4740
def parse_m3u8_attributes(attrib):
    """Parse a comma separated KEY=VALUE attribute list into a dict.

    Values may be enclosed in double quotes (and may then contain commas);
    the enclosing quotes are not part of the returned value.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'

    def _strip_quotes(value):
        return value[1:-1] if value.startswith('"') else value

    return {key: _strip_quotes(value) for key, value in re.findall(attribute_re, attrib)}
4748
4749
def urshift(val, n):
    """Shift `val` right by `n` bits, treating a negative `val` as an
    unsigned 32-bit number (2**32 is added to it first)"""
    if val < 0:
        val += 0x100000000
    return val >> n
4752
4753
4754 # Based on png2str() written by @gdkchan and improved by @yokrysty
4755 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG data into rows of byte values.

    All chunks are read, the concatenated IDAT data is decompressed and the
    per-row filters (Sub, Up, Average, Paeth) are undone. Every row is
    treated as `width * 3` bytes, with the filters looking 3 bytes back.

    @param png_data  The raw contents of the PNG file
    @returns         (width, height, pixels) where pixels is a list of rows,
                     each one a list of `width * 3` integers
    @raises IOError  if the signature or the leading IHDR chunk is missing,
                     or if there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, body = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or body[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    struct_formats = {1: '>B', 2: '>H', 4: '>I'}

    def _read_uint(raw):
        return compat_struct_unpack(struct_formats[len(raw)], raw)[0]

    def _paeth_predictor(a, b, c):
        # Pick whichever of left (a), up (b) and upper-left (c) is closest
        # to a + b - c, preferring them in that order on ties
        estimate = a + b - c
        dist_a = abs(estimate - a)
        dist_b = abs(estimate - b)
        dist_c = abs(estimate - c)
        if dist_a <= dist_b and dist_a <= dist_c:
            return a
        if dist_b <= dist_c:
            return b
        return c

    # Each chunk is: 4 byte length, 4 byte type, data, 4 byte CRC
    chunks = []
    cursor = 0
    while cursor < len(body):
        length = _read_uint(body[cursor:cursor + 4])
        cursor += 4

        chunk_type = body[cursor:cursor + 4]
        cursor += 4

        chunk_data = body[cursor:cursor + length]
        cursor += length

        cursor += 4  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']
    width = _read_uint(ihdr[:4])
    height = _read_uint(ihdr[4:8])

    idat = b''.join(chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for row_index in range(height):
        # Every row of the decompressed data starts with its filter type
        row_start = row_index * (1 + stride)
        filter_type = decompressed_data[row_start]

        previous_row = pixels[-1] if pixels else None
        row = []
        pixels.append(row)

        for col in range(stride):
            value = decompressed_data[1 + row_start + col]
            left = row[col - 3] if col > 2 else 0
            up = previous_row[col] if row_index > 0 else 0

            if filter_type == 1:  # Sub
                value = (value + left) & 0xff
            elif filter_type == 2:  # Up
                value = (value + up) & 0xff
            elif filter_type == 3:  # Average
                value = (value + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[col - 3] if col > 2 and row_index > 0 else 0
                value = (value + _paeth_predictor(left, up, upper_left)) & 0xff

            row.append(value)

    return width, height, pixels
4859
4860
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`.

    The python `xattr` module is used if it can be imported. Otherwise the
    value is written to an NTFS Alternate Data Stream on Windows, or set
    with the `setfattr` or `xattr` command line tools elsewhere.

    @param path   Path of the file
    @param key    Name of the attribute
    @param value  Value of the attribute as bytes. It is decoded as UTF-8
                  before being passed to a command line tool
    @raises XAttrMetadataError     if setting the attribute fails
    @raises XAttrUnavailableError  if the installed pyxattr is too old, or if
                                   no usable implementation is found
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Two different modules are importable as "xattr"; only pyxattr has `set`
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No python module is available; use a platform specific fallback
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # The stream is addressed as "<path>:<key>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The command line tools receive the value as a text argument
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with the tool's stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
4943
4944
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive).

    Returns a dict that maps the given field names to the year, month and
    day of that date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    chosen = earliest + datetime.timedelta(days=random.randint(0, span_days))

    result = {}
    result[year_field] = str(chosen.year)
    result[month_field] = str(chosen.month)
    result[day_field] = str(chosen.day)
    return result
4955
4956
# Templates for internet shortcut files, which are plain text files.
# They are filled in with %-formatting from a dict providing "url"
# (and "filename" for the .desktop template).
DOT_URL_LINK_TEMPLATE = (
    '[InternetShortcut]\n'
    'URL=%(url)s\n'
)

DOT_WEBLOC_LINK_TEMPLATE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
    '<plist version="1.0">\n'
    '<dict>\n'
    '\t<key>URL</key>\n'
    '\t<string>%(url)s</string>\n'
    '</dict>\n'
    '</plist>\n'
)

DOT_DESKTOP_LINK_TEMPLATE = (
    '[Desktop Entry]\n'
    'Encoding=UTF-8\n'
    'Name=%(filename)s\n'
    'Type=Link\n'
    'URL=%(url)s\n'
    'Icon=text-html\n'
)

# Maps the link file type to its template
LINK_TEMPLATES = dict(
    url=DOT_URL_LINK_TEMPLATE,
    desktop=DOT_DESKTOP_LINK_TEMPLATE,
    webloc=DOT_WEBLOC_LINK_TEMPLATE,
)
4988
4989
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for port 80 on `http` URIs, where it is the default anyway. IRIs without a hostname (e.g. relative references) are supported; their network location is left empty.

    Raises ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location at all
    if iri_parts.hostname is not None:
        net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # Port 80 is only redundant for http; for any other scheme, dropping it would change the meaning of the URI
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5032
5033
def to_high_limit_path(path):
    r"""Return `path` in a form that is not subject to the MAX_PATH limit.

    On Windows (and cygwin), the absolute path is prefixed with `\\?\`;
    on all other platforms the path is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
5040
5041
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a field of `obj` using a %-style template.

    @param obj       Object to read the value from (using traverse_obj)
    @param field     Path of the value inside obj
    @param template  %-style template the value is inserted into
    @param ignore    Values for which `default` is returned instead
    @param default   Result to use when the value is ignored
    @param func      Optional function applied to the value before formatting
    """
    value = traverse_obj(obj, *variadic(field))
    if value in ignore:
        return default
    if func:
        value = func(value)
    return template % value
5047
5048
def clean_podcast_url(url):
    """Remove the prefixes of known podcast tracking/analytics services from `url`"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    # Every occurrence is removed, so that chained prefixes are handled too
    cleaned = re.sub(tracking_prefix_re, '', url)
    return cleaned
5064
5065
5066 _HEX_TABLE = '0123456789abcdef'
5067
5068
5069 def random_uuidv4():
5070     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5071
5072
def make_dir(path, to_screen=None):
    """Create the directory that is to contain the file `path`, if necessary.

    @param path       Path of a file; only its directory part is created
    @param to_screen  Optional callable that is given an error message
                      when the directory cannot be created
    @returns          True on success (including when there is nothing to
                      create), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok covers the directory being created by someone else
            # in between the check and this call
            os.makedirs(dn, exist_ok=True)
        return True
    except (OSError, IOError) as err:
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5083
5084
def get_executable_path():
    """Return the absolute path of the directory the program is located in.

    This is the directory of the executable for a frozen (PyInstaller)
    build, two levels above this file when running from a ZIP, and one
    level above this file otherwise.
    """
    from zipimport import zipimporter

    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    else:
        module_dir = os.path.dirname(__file__)
        # Running from ZIP when this module was loaded by a zipimporter
        from_zip = isinstance(globals().get('__loader__'), zipimporter)
        base = os.path.join(module_dir, '../..' if from_zip else '..')
    return os.path.abspath(base)
5094
5095
def load_plugins(name, suffix, namespace):
    """Load the plugin package `ytdlp_plugins/<name>` located next to the program.

    Every attribute of the package whose name ends with `suffix`, and which
    is not present in `namespace` yet, is added to `namespace`.

    @returns  A dict of the newly added attributes by name. It is empty if
              the package does not exist
    """
    classes = {}
    try:
        init_path = os.path.join(
            get_executable_path(), 'ytdlp_plugins', name, '__init__.py')
        plugins_spec = importlib.util.spec_from_file_location(name, init_path)
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for attr_name in dir(plugins):
            # Names already present in the namespace are never overridden
            if attr_name in namespace or not attr_name.endswith(suffix):
                continue
            classes[attr_name] = namespace[attr_name] = getattr(plugins, attr_name)
    except FileNotFoundError:
        pass
    return classes
5114
5115
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The result of the first path that produces a usable value
                            (a list when the path branches and get_all is True),
                            otherwise `default`
    # TODO: Write tests
    '''
    if not casesense:
        # Lower-case the string keys of every path once, up front
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records the deepest level of branching ("...", tuple or
        # function keys) reached, so the caller knows how often to flatten
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Resolve every alternative key, then branch over the results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # The loop below unpacks (key, value) pairs, so the
                    # characters must be paired with their indices
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ":" means "every element"
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Each level of branching added one level of list nesting;
                # flatten all but the outermost one
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5213
5214
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj.

    Writes a deprecation notice and then traverses `dictn` along `keys`,
    treating the keys as user input and allowing traversal into strings.
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5219
5220
def get_first(obj, keys, **kwargs):
    """Return the first match for `keys` among all the children of `obj`.

    The path is prefixed with "..." so that every element of `obj` is
    searched; remaining keyword arguments are passed on to traverse_obj.
    """
    branching_path = (..., *variadic(keys))
    return traverse_obj(obj, branching_path, **kwargs, get_all=False)
5223
5224
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is an iterable, else wrap it in a 1-tuple.

    Instances of `allowed_types` are treated as single values even though
    they are iterable, so they get wrapped as well.
    """
    if not isinstance(x, collections.abc.Iterable):
        return (x,)
    if isinstance(x, allowed_types):
        return (x,)
    return x
5227
5228
def decode_base(value, digits):
    """Convert a base-x string into an integer.

    `digits` lists the symbols of the number system in ascending order, so
    its length is the base. A symbol missing from `digits` raises KeyError.
    """
    radix = len(digits)
    position_of = {symbol: position for position, symbol in enumerate(digits)}
    total = 0
    for symbol in value:
        total = total * radix + position_of[symbol]
    return total
5238
5239
def time_seconds(**kwargs):
    """Return the current time as a POSIX timestamp.

    `kwargs` are passed to datetime.timedelta to build the UTC offset of the
    timezone in which the current time is taken.
    NOTE(review): timestamp() of an aware datetime should not depend on its
    offset, so kwargs probably do not shift the result - confirm with callers
    """
    utc_offset = datetime.timedelta(**kwargs)
    tz = datetime.timezone(utc_offset)
    now = datetime.datetime.now(tz)
    return now.timestamp()
5243
5244
5245 # create a JSON Web Signature (jws) with HS256 algorithm
5246 # the resulting format is in JWS Compact Serialization
5247 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5248 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Build an HS256-signed token of the form header.payload.signature.

    @param payload_data  JSON-serializable claims
    @param key           Signing secret (str); it is UTF-8 encoded for the HMAC
    @param headers       Optional extra header fields; they are merged over
                         the default {'alg': 'HS256', 'typ': 'JWT'}
    @returns             The token as bytes

    NOTE: all three parts use standard base64 with padding (base64.b64encode),
    not unpadded base64url. This is kept as is so existing output is unchanged.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    # The signature covers the encoded header and payload joined by a dot
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return signing_input + b'.' + signature_b64
5262
5263
5264 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded payload of a compact-serialized JWT.

    The signature is NOT verified and the header is not inspected.

    @param jwt  Token string of the form "header.payload.signature"
    @returns    The payload parsed from JSON
    @raises     ValueError if the token does not have exactly three parts
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Tokens normally come with the base64 padding stripped, which the
    # decoder rejects; restore it (already padded input gets nothing added)
    padded_payload = payload_b64 + '=' * (-len(payload_b64) % 4)
    payload_data = json.loads(base64.urlsafe_b64decode(padded_payload))
    return payload_data
5269
5270
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences can be written to `stream`.

    On Windows this needs WINDOWS_VT_MODE to be set and Windows 10.0.10586 or
    newer; elsewhere the TERM environment variable must be non-empty. On top
    of that the stream itself must report being a TTY.
    """
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except Exception:
        # Streams without a usable isatty() are treated as non-terminals.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed
        return False
5282
5283
# ESC, "[", one or more characters other than "m", then "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
5289
5290
def number_of_digits(number):
    """Return the length of `number` formatted with "%d".

    A minus sign is counted too, and non-integers are truncated by "%d".
    """
    rendered = '%d' % number
    return len(rendered)
5293
5294
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy `values` with `delim` after converting them to str.

    If `from_dict` is given, `values` are keys which are first looked up in
    it with .get(); falsy or missing entries are dropped.
    """
    if from_dict is not None:
        lookup = from_dict.get
        values = [lookup(name) for name in values]
    return delim.join(str(item) for item in values if item)
5299
5300
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Replace what `url_width_re` matches with that largest width
    * Update dimensions

    The thumbnails are returned untouched when no format has a usable width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    # Missing or falsy dimensions count as 0; tuples compare by width first
    all_dimensions = [tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats]
    max_dimensions = max(all_dimensions, default=(0, 0))
    max_width = max_dimensions[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5321
5322
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    Returns (start, end, total); end and total go through int_or_none since
    the header may omit them. An empty or unrecognized value gives
    (None, None, None).
    """
    unparsed = (None, None, None)
    if not range:
        return unparsed
    found = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not found:
        return unparsed
    start, end, total = found.groups()
    return int(start), int_or_none(end), int_or_none(total)
5331
5332
class Config:
    """A set of command line arguments plus the configs it pulls in.

    Each Config holds its own arguments (`own_args`) and a list of child
    Configs (`configs`), one for each config location named by its arguments.
    """
    own_args = None  # Arguments of this config itself; set by init()
    filename = None  # File the arguments were read from, if any
    __initialized = False  # init() may only be called once per instance

    def __init__(self, parser, label=None):
        self._parser, self.label = parser, label
        # _loaded_paths is shared with all children (see append_config) so
        # that the same file is never loaded twice
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments of this config and load the configs they refer to.

        @param args      The arguments of this config
        @param filename  The file `args` came from. Relative config locations
                         are resolved against its directory
        @returns         False if `filename` was already loaded, else True
        """
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # Own arguments first (with login info hidden), followed by the child
        # configs, each of whose lines are prefixed with "| "
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments.

        `default` is returned as is when the file cannot be opened. The
        default list is shared between calls, so callers must not modify it.
        """
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        with optionf:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            return compat_shlex_split(contents, comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of login options replaced by "PRIVATE" """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the "--option value" form: hide the argument that follows
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (as taken by init) and add it unless already loaded"""
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All arguments: those of the child configs (latest added first), then own_args"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        """Parse the combined arguments of this config and all its children"""
        return self._parser.parse_args(list(self.all_args))
5420
5421
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes

    The connection gets its own event loop, and every operation is run on
    that loop until it completes. Use as a context manager; __exit__ is also
    registered with atexit.
    """

    def __init__(self, url, headers=None):
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        if self.loop.is_closed():
            # Already cleaned up. This happens when the atexit hook runs
            # after the "with" block has been left
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancelling needs a usable loop, so the loop is closed last
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any other library using asyncio needs to be run in non-async scopes, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result"""
        if not asyncio.coroutines.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel the unfinished tasks of `loop` and wait for them to finish"""
        to_cancel = asyncio.tasks.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() no longer accepts "loop" since Python 3.10; it uses the
        # loop the given tasks belong to
        loop.run_until_complete(
            asyncio.tasks.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5485
5486
# True when compat_websockets (imported from .compat) is truthy
has_websockets = bool(compat_websockets)
5488
5489
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    Header names are normalized with str.title(), so names differing only in
    case end up as a single entry.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged