#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_websockets,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
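
# Illustrative example: xpath_with_ns() expands namespace prefixes into the
# Clark notation that ElementTree expects (namespace URL assumed here):
#   xpath_with_ns('media:song/url', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/url'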


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
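
# A minimal sketch of the xpath helpers above (document built inline purely
# for illustration):
#   doc = compat_etree_fromstring('<root><media url="http://x"/></root>')
#   xpath_element(doc, 'media')             # -> the <media> Element
#   xpath_text(doc, 'media', default=None)  # -> None (element has no text)
#   xpath_attr(doc, 'media', 'url')         # -> 'http://x'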


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document, as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
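
# Illustrative example of the class-based lookups built on the regex above:
#   get_element_by_class('foo', '<div class="foo bar">text</div>')
#   == 'text'
#   get_element_html_by_class('foo', '<div class="foo bar">text</div>')
#   == '<div class="foo bar">text</div>'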


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
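
# Illustrative example - nested tags with the same name are balanced correctly:
#   get_element_text_and_html_by_tag('div', '<div>a<div>b</div>c</div>')
#   == ('a<div>b</div>c', '<div>a<div>b</div>c</div>')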


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
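
# Illustrative example - <br> becomes a newline, tags and entities are dropped:
#   clean_html('<p>foo<br/>bar &amp; baz</p>') == 'foo\nbar & baz'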


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
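
# Illustrative example (classic RFC 2822 date; result in epoch seconds):
#   timeconvert('Sun, 06 Nov 1994 08:49:37 GMT') == 784111777
#   timeconvert('not a date') is None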


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
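
# Illustrative examples:
#   sanitize_filename('New: Part 1/3')                  == 'New - Part 1_3'
#   sanitize_filename('New: Part 1/3', restricted=True) == 'New_-_Part_1_3'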


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
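
# Illustrative examples:
#   sanitize_url('//example.com/video')   == 'http://example.com/video'
#   sanitize_url('httpss://example.com')  == 'https://example.com'
#   sanitize_url('rmtp://example.com')    == 'rtmp://example.com'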


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
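
# Illustrative example - credentials are stripped from the URL and returned
# as a ready-made Authorization header value:
#   extract_basic_auth('http://user:pass@example.com/x')
#   == ('http://example.com/x', 'Basic dXNlcjpwYXNz')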


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
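
# Illustrative example - order of first occurrence is preserved:
#   orderedSet([1, 2, 1, 3, 2]) == [1, 2, 3]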


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
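
# Illustrative examples:
#   unescapeHTML('&eacute;ric &amp; co') == 'éric & co'
#   unescapeHTML('&#38;') == '&'
#   unescapeHTML('&bogus;') == '&bogus;'  # unknown entities are left literal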


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)
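
# Illustrative usage (assumes a POSIX `echo` binary; the child process is
# killed if communicate() is interrupted, e.g. by KeyboardInterrupt):
#   out, err = Popen(['echo', 'hi'], stdout=subprocess.PIPE).communicate_or_kill()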


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
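
# Worked examples (illustrative):
#   timetuple_from_msec(3661500) == Time(hours=1, minutes=1, seconds=1, milliseconds=500)
#   formatSeconds(3661.5, msec=True) == '1:01:01.500'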


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(self.cause)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
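
# Illustrative example - the marker header disables compression and is itself removed:
#   handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'}) == {}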


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is
        # not always respected by websites - some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1733
1734
1735 def parse_iso8601(date_str, delimiter='T', timezone=None):
1736 """ Return a UNIX timestamp from the given date """
1737
1738 if date_str is None:
1739 return None
1740
1741 date_str = re.sub(r'\.[0-9]+', '', date_str)
1742
1743 if timezone is None:
1744 timezone, date_str = extract_timezone(date_str)
1745
1746 try:
1747 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1748 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1749 return calendar.timegm(dt.timetuple())
1750 except ValueError:
1751 pass
1752
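# Illustrative examples (added for clarity, not part of the original source).
# The timezone offset is extracted first and any fractional-second part is
# dropped before parsing:
#   >>> parse_iso8601('2021-12-31T12:00:00+01:00')
#   1640948400
#   >>> parse_iso8601('2021-12-31 12:00:00.123', delimiter=' ', timezone=datetime.timedelta(0))
#   1640952000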
1753
1754 def date_formats(day_first=True):
1755 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1756
1757
1758 def unified_strdate(date_str, day_first=True):
1759 """Return a string with the date in the format YYYYMMDD"""
1760
1761 if date_str is None:
1762 return None
1763 upload_date = None
1764 # Replace commas
1765 date_str = date_str.replace(',', ' ')
1766 # Remove AM/PM + timezone
1767 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1768 _, date_str = extract_timezone(date_str)
1769
1770 for expression in date_formats(day_first):
1771 try:
1772 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1773 except ValueError:
1774 pass
1775 if upload_date is None:
1776 timetuple = email.utils.parsedate_tz(date_str)
1777 if timetuple:
1778 try:
1779 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1780 except ValueError:
1781 pass
1782 if upload_date is not None:
1783 return compat_str(upload_date)
1784
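# Illustrative examples (added for clarity, not part of the original source).
# The accepted patterns come from the DATE_FORMATS* tables defined earlier in
# this module; assuming those include '%B %d %Y', '%d/%m/%Y' and '%m/%d/%Y':
#   >>> unified_strdate('December 31, 2021')
#   '20211231'
#   >>> unified_strdate('02/03/2021')                  # day first by default
#   '20210302'
#   >>> unified_strdate('02/03/2021', day_first=False)
#   '20210203'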
1785
1786 def unified_timestamp(date_str, day_first=True):
1787 if date_str is None:
1788 return None
1789
1790 date_str = re.sub(r'[,|]', '', date_str)
1791
1792 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1793 timezone, date_str = extract_timezone(date_str)
1794
1795 # Remove AM/PM + timezone
1796 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1797
1798 # Remove unrecognized timezones from ISO 8601 alike timestamps
1799 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1800 if m:
1801 date_str = date_str[:-len(m.group('tz'))]
1802
1803 # Python only supports microseconds, so remove nanoseconds
1804 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1805 if m:
1806 date_str = m.group(1)
1807
1808 for expression in date_formats(day_first):
1809 try:
1810 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1811 return calendar.timegm(dt.timetuple())
1812 except ValueError:
1813 pass
1814 timetuple = email.utils.parsedate_tz(date_str)
1815 if timetuple:
1816 return calendar.timegm(timetuple) + pm_delta * 3600
1817
1818
1819 def determine_ext(url, default_ext='unknown_video'):
1820 if url is None or '.' not in url:
1821 return default_ext
1822 guess = url.partition('?')[0].rpartition('.')[2]
1823 if re.match(r'^[A-Za-z0-9]+$', guess):
1824 return guess
1825 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1826 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1827 return guess.rstrip('/')
1828 else:
1829 return default_ext
1830
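# Illustrative examples (added for clarity, not part of the original source;
# KNOWN_EXTENSIONS is defined elsewhere in this module and is assumed to
# contain 'mp4'):
#   >>> determine_ext('http://example.com/video.mp4?download=1')
#   'mp4'
#   >>> determine_ext('http://example.com/video.mp4/?download')   # trailing slash
#   'mp4'
#   >>> determine_ext('http://example.com/stream')
#   'unknown_video'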
1831
1832 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1833 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1834
1835
1836 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1837 """
1838 Return a datetime object from a string in the format YYYYMMDD or
1839 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1840
1841 format: string date format used to parse date_str into a datetime object
1842 precision: round the time portion of a datetime object.
1843 auto|microsecond|second|minute|hour|day.
1844 auto: round to the unit provided in date_str (if applicable).
1845 """
1846 auto_precision = False
1847 if precision == 'auto':
1848 auto_precision = True
1849 precision = 'microsecond'
1850 today = datetime_round(datetime.datetime.utcnow(), precision)
1851 if date_str in ('now', 'today'):
1852 return today
1853 if date_str == 'yesterday':
1854 return today - datetime.timedelta(days=1)
1855 match = re.match(
1856 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1857 date_str)
1858 if match is not None:
1859 start_time = datetime_from_str(match.group('start'), precision, format)
1860 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1861 unit = match.group('unit')
1862 if unit == 'month' or unit == 'year':
1863 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1864 unit = 'day'
1865 else:
1866 if unit == 'week':
1867 unit = 'day'
1868 time *= 7
1869 delta = datetime.timedelta(**{unit + 's': time})
1870 new_date = start_time + delta
1871 if auto_precision:
1872 return datetime_round(new_date, unit)
1873 return new_date
1874
1875 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1876
1877
1878 def date_from_str(date_str, format='%Y%m%d', strict=False):
1879 """
1880 Return a datetime object from a string in the format YYYYMMDD or
1881 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1882
1883 If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1884
1885 format: string date format used to parse date_str into a datetime object
1886 """
1887 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1888 raise ValueError(f'Invalid date format {date_str}')
1889 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1890
1891
1892 def datetime_add_months(dt, months):
1893 """Increment/Decrement a datetime object by months."""
1894 month = dt.month + months - 1
1895 year = dt.year + month // 12
1896 month = month % 12 + 1
1897 day = min(dt.day, calendar.monthrange(year, month)[1])
1898 return dt.replace(year, month, day)
1899
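# Illustrative examples (added for clarity, not part of the original source).
# The day of the month is clamped, so adding one month to Jan 31 yields the
# last day of February instead of overflowing:
#   >>> datetime_add_months(datetime.datetime(2021, 1, 31), 1)
#   datetime.datetime(2021, 2, 28, 0, 0)
#   >>> datetime_add_months(datetime.datetime(2021, 1, 31), 12)
#   datetime.datetime(2022, 1, 31, 0, 0)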
1900
1901 def datetime_round(dt, precision='day'):
1902 """
1903 Round a datetime object's time to a specific precision
1904 """
1905 if precision == 'microsecond':
1906 return dt
1907
1908 unit_seconds = {
1909 'day': 86400,
1910 'hour': 3600,
1911 'minute': 60,
1912 'second': 1,
1913 }
1914 roundto = lambda x, n: ((x + n / 2) // n) * n
1915 timestamp = calendar.timegm(dt.timetuple())
1916 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1917
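# Illustrative examples (added for clarity, not part of the original source).
# Rounding is to the nearest unit (half up), computed on the UTC timestamp:
#   >>> datetime_round(datetime.datetime(2021, 5, 1, 14, 40), 'hour')
#   datetime.datetime(2021, 5, 1, 15, 0)
#   >>> datetime_round(datetime.datetime(2021, 5, 1, 14, 40), 'day')
#   datetime.datetime(2021, 5, 2, 0, 0)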
1918
1919 def hyphenate_date(date_str):
1920 """
1921 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1922 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1923 if match is not None:
1924 return '-'.join(match.groups())
1925 else:
1926 return date_str
1927
1928
1929 class DateRange(object):
1930 """Represents a time interval between two dates"""
1931
1932 def __init__(self, start=None, end=None):
1933 """start and end must be strings in the format accepted by date"""
1934 if start is not None:
1935 self.start = date_from_str(start, strict=True)
1936 else:
1937 self.start = datetime.datetime.min.date()
1938 if end is not None:
1939 self.end = date_from_str(end, strict=True)
1940 else:
1941 self.end = datetime.datetime.max.date()
1942 if self.start > self.end:
1943 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1944
1945 @classmethod
1946 def day(cls, day):
1947 """Returns a range that only contains the given day"""
1948 return cls(day, day)
1949
1950 def __contains__(self, date):
1951 """Check if the date is in the range"""
1952 if not isinstance(date, datetime.date):
1953 date = date_from_str(date)
1954 return self.start <= date <= self.end
1955
1956 def __str__(self):
1957 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1958
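# Illustrative usage (added for clarity, not part of the original source):
#   >>> '20210615' in DateRange('20210101', '20211231')
#   True
#   >>> '20220101' in DateRange('20210101', '20211231')
#   False
# DateRange.day('20210101') builds a range containing only that single day.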
1959
1960 def platform_name():
1961 """ Returns the platform name as a compat_str """
1962 res = platform.platform()
1963 if isinstance(res, bytes):
1964 res = res.decode(preferredencoding())
1965
1966 assert isinstance(res, compat_str)
1967 return res
1968
1969
1970 def get_windows_version():
1971 ''' Get Windows version. None if it's not running on Windows '''
1972 if compat_os_name == 'nt':
1973 return version_tuple(platform.win32_ver()[1])
1974 else:
1975 return None
1976
1977
1978 def _windows_write_string(s, out):
1979 """ Returns True if the string was written using special methods,
1980 False if it has yet to be written out."""
1981 # Adapted from http://stackoverflow.com/a/3259271/35070
1982
1983 import ctypes.wintypes
1984
1985 WIN_OUTPUT_IDS = {
1986 1: -11,
1987 2: -12,
1988 }
1989
1990 try:
1991 fileno = out.fileno()
1992 except AttributeError:
1993 # If the output stream doesn't have a fileno, it's virtual
1994 return False
1995 except io.UnsupportedOperation:
1996 # Some strange Windows pseudo files?
1997 return False
1998 if fileno not in WIN_OUTPUT_IDS:
1999 return False
2000
2001 GetStdHandle = compat_ctypes_WINFUNCTYPE(
2002 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2003 ('GetStdHandle', ctypes.windll.kernel32))
2004 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2005
2006 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2007 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2008 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2009 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2010 written = ctypes.wintypes.DWORD(0)
2011
2012 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2013 FILE_TYPE_CHAR = 0x0002
2014 FILE_TYPE_REMOTE = 0x8000
2015 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2016 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2017 ctypes.POINTER(ctypes.wintypes.DWORD))(
2018 ('GetConsoleMode', ctypes.windll.kernel32))
2019 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2020
2021 def not_a_console(handle):
2022 if handle == INVALID_HANDLE_VALUE or handle is None:
2023 return True
2024 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2025 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2026
2027 if not_a_console(h):
2028 return False
2029
2030 def next_nonbmp_pos(s):
2031 try:
2032 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2033 except StopIteration:
2034 return len(s)
2035
2036 while s:
2037 count = min(next_nonbmp_pos(s), 1024)
2038
2039 ret = WriteConsoleW(
2040 h, s, count if count else 2, ctypes.byref(written), None)
2041 if ret == 0:
2042 raise OSError('Failed to write string')
2043 if not count: # We just wrote a non-BMP character
2044 assert written.value == 2
2045 s = s[1:]
2046 else:
2047 assert written.value > 0
2048 s = s[written.value:]
2049 return True
2050
2051
2052 def write_string(s, out=None, encoding=None):
2053 if out is None:
2054 out = sys.stderr
2055 assert type(s) == compat_str
2056
2057 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2058 if _windows_write_string(s, out):
2059 return
2060
2061 if ('b' in getattr(out, 'mode', '')
2062 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2063 byt = s.encode(encoding or preferredencoding(), 'ignore')
2064 out.write(byt)
2065 elif hasattr(out, 'buffer'):
2066 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2067 byt = s.encode(enc, 'ignore')
2068 out.buffer.write(byt)
2069 else:
2070 out.write(s)
2071 out.flush()
2072
2073
2074 def bytes_to_intlist(bs):
2075 if not bs:
2076 return []
2077 if isinstance(bs[0], int): # Python 3
2078 return list(bs)
2079 else:
2080 return [ord(c) for c in bs]
2081
2082
2083 def intlist_to_bytes(xs):
2084 if not xs:
2085 return b''
2086 return compat_struct_pack('%dB' % len(xs), *xs)
2087
2088
2089 # Cross-platform file locking
2090 if sys.platform == 'win32':
2091 import ctypes.wintypes
2092 import msvcrt
2093
2094 class OVERLAPPED(ctypes.Structure):
2095 _fields_ = [
2096 ('Internal', ctypes.wintypes.LPVOID),
2097 ('InternalHigh', ctypes.wintypes.LPVOID),
2098 ('Offset', ctypes.wintypes.DWORD),
2099 ('OffsetHigh', ctypes.wintypes.DWORD),
2100 ('hEvent', ctypes.wintypes.HANDLE),
2101 ]
2102
2103 kernel32 = ctypes.windll.kernel32
2104 LockFileEx = kernel32.LockFileEx
2105 LockFileEx.argtypes = [
2106 ctypes.wintypes.HANDLE, # hFile
2107 ctypes.wintypes.DWORD, # dwFlags
2108 ctypes.wintypes.DWORD, # dwReserved
2109 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2110 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2111 ctypes.POINTER(OVERLAPPED) # Overlapped
2112 ]
2113 LockFileEx.restype = ctypes.wintypes.BOOL
2114 UnlockFileEx = kernel32.UnlockFileEx
2115 UnlockFileEx.argtypes = [
2116 ctypes.wintypes.HANDLE, # hFile
2117 ctypes.wintypes.DWORD, # dwReserved
2118 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2119 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2120 ctypes.POINTER(OVERLAPPED) # Overlapped
2121 ]
2122 UnlockFileEx.restype = ctypes.wintypes.BOOL
2123 whole_low = 0xffffffff
2124 whole_high = 0x7fffffff
2125
2126 def _lock_file(f, exclusive, block):
2127 overlapped = OVERLAPPED()
2128 overlapped.Offset = 0
2129 overlapped.OffsetHigh = 0
2130 overlapped.hEvent = 0
2131 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2132
2133 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2134 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2135 0, whole_low, whole_high, f._lock_file_overlapped_p):
2136 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2137
2138 def _unlock_file(f):
2139 assert f._lock_file_overlapped_p
2140 handle = msvcrt.get_osfhandle(f.fileno())
2141 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2142 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2143
2144 else:
2145 try:
2146 import fcntl
2147
2148 def _lock_file(f, exclusive, block):
2149 try:
2150 fcntl.flock(f,
2151 fcntl.LOCK_SH if not exclusive
2152 else fcntl.LOCK_EX if block
2153 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2154 except BlockingIOError:
2155 raise
2156 except OSError: # AOSP does not have flock()
2157 fcntl.lockf(f,
2158 fcntl.LOCK_SH if not exclusive
2159 else fcntl.LOCK_EX if block
2160 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2161
2162 def _unlock_file(f):
2163 try:
2164 fcntl.flock(f, fcntl.LOCK_UN)
2165 except OSError:
2166 fcntl.lockf(f, fcntl.LOCK_UN)
2167
2168 except ImportError:
2169 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2170
2171 def _lock_file(f, exclusive, block):
2172 raise IOError(UNSUPPORTED_MSG)
2173
2174 def _unlock_file(f):
2175 raise IOError(UNSUPPORTED_MSG)
2176
2177
2178 class locked_file(object):
2179 _closed = False
2180
2181 def __init__(self, filename, mode, block=True, encoding=None):
2182 assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2183 self.f = io.open(filename, mode, encoding=encoding)
2184 self.mode = mode
2185 self.block = block
2186
2187 def __enter__(self):
2188 exclusive = 'r' not in self.mode
2189 try:
2190 _lock_file(self.f, exclusive, self.block)
2191 except IOError:
2192 self.f.close()
2193 raise
2194 return self
2195
2196 def __exit__(self, etype, value, traceback):
2197 try:
2198 if not self._closed:
2199 _unlock_file(self.f)
2200 finally:
2201 self.f.close()
2202 self._closed = True
2203
2204 def __iter__(self):
2205 return iter(self.f)
2206
2207 def write(self, *args):
2208 return self.f.write(*args)
2209
2210 def read(self, *args):
2211 return self.f.read(*args)
2212
2213 def flush(self):
2214 self.f.flush()
2215
2216 def open(self):
2217 return self.__enter__()
2218
2219 def close(self, *args):
2220 self.__exit__(None, None, None)  # unlock and close; no exception is propagated
2221
2222
2223 def get_filesystem_encoding():
2224 encoding = sys.getfilesystemencoding()
2225 return encoding if encoding is not None else 'utf-8'
2226
2227
2228 def shell_quote(args):
2229 quoted_args = []
2230 encoding = get_filesystem_encoding()
2231 for a in args:
2232 if isinstance(a, bytes):
2233 # We may get a filename encoded with 'encodeFilename'
2234 a = a.decode(encoding)
2235 quoted_args.append(compat_shlex_quote(a))
2236 return ' '.join(quoted_args)
2237
2238
2239 def smuggle_url(url, data):
2240 """ Pass additional data in a URL for internal use. """
2241
2242 url, idata = unsmuggle_url(url, {})
2243 data.update(idata)
2244 sdata = compat_urllib_parse_urlencode(
2245 {'__youtubedl_smuggle': json.dumps(data)})
2246 return url + '#' + sdata
2247
2248
2249 def unsmuggle_url(smug_url, default=None):
2250 if '#__youtubedl_smuggle' not in smug_url:
2251 return smug_url, default
2252 url, _, sdata = smug_url.rpartition('#')
2253 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2254 data = json.loads(jsond)
2255 return url, data
2256
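# Illustrative round trip (added for clarity, not part of the original
# source; the URL and data are made up). The extra data travels in the URL
# fragment, so it is never sent to the server:
#   >>> url = smuggle_url('https://example.com/v/123', {'referer': 'https://example.com/'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/v/123', {'referer': 'https://example.com/'})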
2257
2258 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2259 """ Formats numbers with decimal sufixes like K, M, etc """
2260 num, factor = float_or_none(num), float(factor)
2261 if num is None or num < 0:
2262 return None
2263 exponent = 0 if num == 0 else int(math.log(num, factor))
2264 suffix = ['', *'kMGTPEZY'][exponent]
2265 if factor == 1024:
2266 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2267 converted = num / (factor ** exponent)
2268 return fmt % (converted, suffix)
2269
2270
2271 def format_bytes(bytes):
2272 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2273
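# Illustrative examples (added for clarity, not part of the original source):
#   >>> format_decimal_suffix(1234567, '%.1f%s')
#   '1.2M'
#   >>> format_bytes(1536)    # factor=1024 switches to KiB/MiB/... suffixes
#   '1.50KiB'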
2274
2275 def lookup_unit_table(unit_table, s):
2276 units_re = '|'.join(re.escape(u) for u in unit_table)
2277 m = re.match(
2278 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2279 if not m:
2280 return None
2281 num_str = m.group('num').replace(',', '.')
2282 mult = unit_table[m.group('unit')]
2283 return int(float(num_str) * mult)
2284
2285
2286 def parse_filesize(s):
2287 if s is None:
2288 return None
2289
2290 # The lower-case forms are of course incorrect and unofficial,
2291 # but we support those too
2292 _UNIT_TABLE = {
2293 'B': 1,
2294 'b': 1,
2295 'bytes': 1,
2296 'KiB': 1024,
2297 'KB': 1000,
2298 'kB': 1024,
2299 'Kb': 1000,
2300 'kb': 1000,
2301 'kilobytes': 1000,
2302 'kibibytes': 1024,
2303 'MiB': 1024 ** 2,
2304 'MB': 1000 ** 2,
2305 'mB': 1024 ** 2,
2306 'Mb': 1000 ** 2,
2307 'mb': 1000 ** 2,
2308 'megabytes': 1000 ** 2,
2309 'mebibytes': 1024 ** 2,
2310 'GiB': 1024 ** 3,
2311 'GB': 1000 ** 3,
2312 'gB': 1024 ** 3,
2313 'Gb': 1000 ** 3,
2314 'gb': 1000 ** 3,
2315 'gigabytes': 1000 ** 3,
2316 'gibibytes': 1024 ** 3,
2317 'TiB': 1024 ** 4,
2318 'TB': 1000 ** 4,
2319 'tB': 1024 ** 4,
2320 'Tb': 1000 ** 4,
2321 'tb': 1000 ** 4,
2322 'terabytes': 1000 ** 4,
2323 'tebibytes': 1024 ** 4,
2324 'PiB': 1024 ** 5,
2325 'PB': 1000 ** 5,
2326 'pB': 1024 ** 5,
2327 'Pb': 1000 ** 5,
2328 'pb': 1000 ** 5,
2329 'petabytes': 1000 ** 5,
2330 'pebibytes': 1024 ** 5,
2331 'EiB': 1024 ** 6,
2332 'EB': 1000 ** 6,
2333 'eB': 1024 ** 6,
2334 'Eb': 1000 ** 6,
2335 'eb': 1000 ** 6,
2336 'exabytes': 1000 ** 6,
2337 'exbibytes': 1024 ** 6,
2338 'ZiB': 1024 ** 7,
2339 'ZB': 1000 ** 7,
2340 'zB': 1024 ** 7,
2341 'Zb': 1000 ** 7,
2342 'zb': 1000 ** 7,
2343 'zettabytes': 1000 ** 7,
2344 'zebibytes': 1024 ** 7,
2345 'YiB': 1024 ** 8,
2346 'YB': 1000 ** 8,
2347 'yB': 1024 ** 8,
2348 'Yb': 1000 ** 8,
2349 'yb': 1000 ** 8,
2350 'yottabytes': 1000 ** 8,
2351 'yobibytes': 1024 ** 8,
2352 }
2353
2354 return lookup_unit_table(_UNIT_TABLE, s)
2355
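# Illustrative examples (added for clarity, not part of the original source).
# Decimal commas are accepted as decimal points, and SI vs binary prefixes
# are resolved through the table above:
#   >>> parse_filesize('1.5 GiB')
#   1610612736
#   >>> parse_filesize('10 MB')
#   10000000
#   >>> parse_filesize('1,5GB')
#   1500000000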
2356
2357 def parse_count(s):
2358 if s is None:
2359 return None
2360
2361 s = re.sub(r'^[^\d]+\s', '', s).strip()
2362
2363 if re.match(r'^[\d,.]+$', s):
2364 return str_to_int(s)
2365
2366 _UNIT_TABLE = {
2367 'k': 1000,
2368 'K': 1000,
2369 'm': 1000 ** 2,
2370 'M': 1000 ** 2,
2371 'kk': 1000 ** 2,
2372 'KK': 1000 ** 2,
2373 'b': 1000 ** 3,
2374 'B': 1000 ** 3,
2375 }
2376
2377 ret = lookup_unit_table(_UNIT_TABLE, s)
2378 if ret is not None:
2379 return ret
2380
2381 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2382 if mobj:
2383 return str_to_int(mobj.group(1))
2384
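# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('1,234 views')
#   1234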
2385
2386 def parse_resolution(s):
2387 if s is None:
2388 return {}
2389
2390 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2391 if mobj:
2392 return {
2393 'width': int(mobj.group('w')),
2394 'height': int(mobj.group('h')),
2395 }
2396
2397 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2398 if mobj:
2399 return {'height': int(mobj.group(1))}
2400
2401 mobj = re.search(r'\b([48])[kK]\b', s)
2402 if mobj:
2403 return {'height': int(mobj.group(1)) * 540}
2404
2405 return {}
2406
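# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4K')    # 4K/8K are mapped via height = n * 540
#   {'height': 2160}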
2407
2408 def parse_bitrate(s):
2409 if not isinstance(s, compat_str):
2410 return
2411 mobj = re.search(r'\b(\d+)\s*kbps', s)
2412 if mobj:
2413 return int(mobj.group(1))
2414
2415
2416 def month_by_name(name, lang='en'):
2417 """ Return the number of a month by (locale-independently) English name """
2418
2419 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2420
2421 try:
2422 return month_names.index(name) + 1
2423 except ValueError:
2424 return None
2425
2426
2427 def month_by_abbreviation(abbrev):
2428 """ Return the number of a month by (locale-independently) English
2429 abbreviations """
2430
2431 try:
2432 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2433 except ValueError:
2434 return None
2435
2436
2437 def fix_xml_ampersands(xml_str):
2438 """Replace all the '&' by '&amp;' in XML"""
2439 return re.sub(
2440 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2441 '&amp;',
2442 xml_str)
2443
2444
2445 def setproctitle(title):
2446 assert isinstance(title, compat_str)
2447
2448 # ctypes in Jython is not complete
2449 # http://bugs.jython.org/issue2148
2450 if sys.platform.startswith('java'):
2451 return
2452
2453 try:
2454 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2455 except OSError:
2456 return
2457 except TypeError:
2458 # LoadLibrary in Windows Python 2.7.13 only expects
2459 # a bytestring, but since unicode_literals turns
2460 # every string into a unicode string, it fails.
2461 return
2462 title_bytes = title.encode('utf-8')
2463 buf = ctypes.create_string_buffer(len(title_bytes))
2464 buf.value = title_bytes
2465 try:
2466 libc.prctl(15, buf, 0, 0, 0)
2467 except AttributeError:
2468 return # Strange libc, just skip this
2469
2470
2471 def remove_start(s, start):
2472 return s[len(start):] if s is not None and s.startswith(start) else s
2473
2474
2475 def remove_end(s, end):
2476 return s[:-len(end)] if s is not None and s.endswith(end) else s
2477
2478
2479 def remove_quotes(s):
2480 if s is None or len(s) < 2:
2481 return s
2482 for quote in ('"', "'", ):
2483 if s[0] == quote and s[-1] == quote:
2484 return s[1:-1]
2485 return s
2486
2487
2488 def get_domain(url):
2489 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2490 return domain.group('domain') if domain else None
2491
2492
2493 def url_basename(url):
2494 path = compat_urlparse.urlparse(url).path
2495 return path.strip('/').split('/')[-1]
2496
2497
2498 def base_url(url):
2499 return re.match(r'https?://[^?#&]+/', url).group()
2500
2501
2502 def urljoin(base, path):
2503 if isinstance(path, bytes):
2504 path = path.decode('utf-8')
2505 if not isinstance(path, compat_str) or not path:
2506 return None
2507 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2508 return path
2509 if isinstance(base, bytes):
2510 base = base.decode('utf-8')
2511 if not isinstance(base, compat_str) or not re.match(
2512 r'^(?:https?:)?//', base):
2513 return None
2514 return compat_urlparse.urljoin(base, path)
2515
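# Illustrative examples (added for clarity, not part of the original source;
# URLs are made up):
#   >>> urljoin('https://example.com/a/b', 'c/d.mp4')
#   'https://example.com/a/c/d.mp4'
#   >>> urljoin('https://example.com/', '//cdn.example.com/f.mp4')   # already absolute
#   '//cdn.example.com/f.mp4'
#   >>> urljoin('ftp://example.com/', '/f.mp4') is None              # non-http(s) base
#   True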
2516
2517 class HEADRequest(compat_urllib_request.Request):
2518 def get_method(self):
2519 return 'HEAD'
2520
2521
2522 class PUTRequest(compat_urllib_request.Request):
2523 def get_method(self):
2524 return 'PUT'
2525
2526
2527 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2528 if get_attr and v is not None:
2529 v = getattr(v, get_attr, None)
2530 try:
2531 return int(v) * invscale // scale
2532 except (ValueError, TypeError, OverflowError):
2533 return default
2534
2535
2536 def str_or_none(v, default=None):
2537 return default if v is None else compat_str(v)
2538
2539
2540 def str_to_int(int_str):
2541 """ A more relaxed version of int_or_none """
2542 if isinstance(int_str, compat_integer_types):
2543 return int_str
2544 elif isinstance(int_str, compat_str):
2545 int_str = re.sub(r'[,\.\+]', '', int_str)
2546 return int_or_none(int_str)
2547
2548
2549 def float_or_none(v, scale=1, invscale=1, default=None):
2550 if v is None:
2551 return default
2552 try:
2553 return float(v) * invscale / scale
2554 except (ValueError, TypeError):
2555 return default
2556
2557
2558 def bool_or_none(v, default=None):
2559 return v if isinstance(v, bool) else default
2560
2561
2562 def strip_or_none(v, default=None):
2563 return v.strip() if isinstance(v, compat_str) else default
2564
2565
2566 def url_or_none(url):
2567 if not url or not isinstance(url, compat_str):
2568 return None
2569 url = url.strip()
2570 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2571
2572
2573 def request_to_url(req):
2574 if isinstance(req, compat_urllib_request.Request):
2575 return req.get_full_url()
2576 else:
2577 return req
2578
2579
2580 def strftime_or_none(timestamp, date_format, default=None):
2581 datetime_object = None
2582 try:
2583 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2584 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2585 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2586 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2587 return datetime_object.strftime(date_format)
2588 except (ValueError, TypeError, AttributeError):
2589 return default
2590
2591
2592 def parse_duration(s):
2593 if not isinstance(s, compat_basestring):
2594 return None
2595 s = s.strip()
2596 if not s:
2597 return None
2598
2599 days, hours, mins, secs, ms = [None] * 5
2600 m = re.match(r'''(?x)
2601 (?P<before_secs>
2602 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2603 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2604 (?P<ms>[.:][0-9]+)?Z?$
2605 ''', s)
2606 if m:
2607 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2608 else:
2609 m = re.match(
2610 r'''(?ix)(?:P?
2611 (?:
2612 [0-9]+\s*y(?:ears?)?\s*
2613 )?
2614 (?:
2615 [0-9]+\s*m(?:onths?)?\s*
2616 )?
2617 (?:
2618 [0-9]+\s*w(?:eeks?)?\s*
2619 )?
2620 (?:
2621 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2622 )?
2623 T)?
2624 (?:
2625 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2626 )?
2627 (?:
2628 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2629 )?
2630 (?:
2631 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2632 )?Z?$''', s)
2633 if m:
2634 days, hours, mins, secs, ms = m.groups()
2635 else:
2636 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2637 if m:
2638 hours, mins = m.groups()
2639 else:
2640 return None
2641
2642 duration = 0
2643 if secs:
2644 duration += float(secs)
2645 if mins:
2646 duration += float(mins) * 60
2647 if hours:
2648 duration += float(hours) * 60 * 60
2649 if days:
2650 duration += float(days) * 24 * 60 * 60
2651 if ms:
2652 duration += float(ms.replace(':', '.'))
2653 return duration
2654
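# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_duration('1:02:03')
#   3723.0
#   >>> parse_duration('2h 30m')
#   9000.0
#   >>> parse_duration('PT1M30S')    # ISO 8601 style durations also work
#   90.0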
2655
2656 def prepend_extension(filename, ext, expected_real_ext=None):
2657 name, real_ext = os.path.splitext(filename)
2658 return (
2659 '{0}.{1}{2}'.format(name, ext, real_ext)
2660 if not expected_real_ext or real_ext[1:] == expected_real_ext
2661 else '{0}.{1}'.format(filename, ext))
2662
2663
2664 def replace_extension(filename, ext, expected_real_ext=None):
2665 name, real_ext = os.path.splitext(filename)
2666 return '{0}.{1}'.format(
2667 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2668 ext)
2669
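# Illustrative examples (added for clarity, not part of the original source).
# When expected_real_ext does not match, the old extension is kept intact:
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.mp4', 'webm')
#   'video.webm'
#   >>> replace_extension('video.f4v', 'mp4', expected_real_ext='flv')
#   'video.f4v.mp4'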
2670
2671 def check_executable(exe, args=[]):
2672 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2673 args can be a list of arguments for a short output (like -version) """
2674 try:
2675 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2676 except OSError:
2677 return False
2678 return exe
2679
2680
2681 def _get_exe_version_output(exe, args):
2682 try:
2683 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2684 # SIGTTOU if yt-dlp is run in the background.
2685 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2686 out, _ = Popen(
2687 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2688 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2689 except OSError:
2690 return False
2691 if isinstance(out, bytes): # Python 2.x
2692 out = out.decode('ascii', 'ignore')
2693 return out
2694
2695
2696 def detect_exe_version(output, version_re=None, unrecognized='present'):
2697 assert isinstance(output, compat_str)
2698 if version_re is None:
2699 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2700 m = re.search(version_re, output)
2701 if m:
2702 return m.group(1)
2703 else:
2704 return unrecognized
2705
2706
2707 def get_exe_version(exe, args=['--version'],
2708 version_re=None, unrecognized='present'):
2709 """ Returns the version of the specified executable,
2710 or False if the executable is not present """
2711 out = _get_exe_version_output(exe, args)
2712 return detect_exe_version(out, version_re, unrecognized) if out else False
2713
2714
2715 class LazyList(collections.abc.Sequence):
2716 ''' Lazy immutable list from an iterable
2717 Note that slices of a LazyList are lists and not LazyList'''
2718
2719 class IndexError(IndexError):
2720 pass
2721
2722 def __init__(self, iterable, *, reverse=False, _cache=None):
2723 self.__iterable = iter(iterable)
2724 self.__cache = [] if _cache is None else _cache
2725 self.__reversed = reverse
2726
2727 def __iter__(self):
2728 if self.__reversed:
2729 # We need to consume the entire iterable to iterate in reverse
2730 yield from self.exhaust()
2731 return
2732 yield from self.__cache
2733 for item in self.__iterable:
2734 self.__cache.append(item)
2735 yield item
2736
2737 def __exhaust(self):
2738 self.__cache.extend(self.__iterable)
2739 # Discard the emptied iterable to make it pickle-able
2740 self.__iterable = []
2741 return self.__cache
2742
2743 def exhaust(self):
2744 ''' Evaluate the entire iterable '''
2745 return self.__exhaust()[::-1 if self.__reversed else 1]
2746
2747 @staticmethod
2748 def __reverse_index(x):
2749 return None if x is None else -(x + 1)
2750
2751 def __getitem__(self, idx):
2752 if isinstance(idx, slice):
2753 if self.__reversed:
2754 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2755 start, stop, step = idx.start, idx.stop, idx.step or 1
2756 elif isinstance(idx, int):
2757 if self.__reversed:
2758 idx = self.__reverse_index(idx)
2759 start, stop, step = idx, idx, 0
2760 else:
2761 raise TypeError('indices must be integers or slices')
2762 if ((start or 0) < 0 or (stop or 0) < 0
2763 or (start is None and step < 0)
2764 or (stop is None and step > 0)):
2765 # We need to consume the entire iterable to be able to slice from the end
2766 # Obviously, never use this with infinite iterables
2767 self.__exhaust()
2768 try:
2769 return self.__cache[idx]
2770 except IndexError as e:
2771 raise self.IndexError(e) from e
2772 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2773 if n > 0:
2774 self.__cache.extend(itertools.islice(self.__iterable, n))
2775 try:
2776 return self.__cache[idx]
2777 except IndexError as e:
2778 raise self.IndexError(e) from e
2779
2780 def __bool__(self):
2781 try:
2782 self[-1] if self.__reversed else self[0]
2783 except self.IndexError:
2784 return False
2785 return True
2786
2787 def __len__(self):
2788 self.__exhaust()
2789 return len(self.__cache)
2790
2791 def __reversed__(self):
2792 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2793
2794 def __copy__(self):
2795 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2796
2797 def __repr__(self):
2798 # repr and str should mimic a list. So we exhaust the iterable
2799 return repr(self.exhaust())
2800
2801 def __str__(self):
2802 return repr(self.exhaust())
2803
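# Illustrative usage (added for clarity, not part of the original source).
# A LazyList can wrap an infinite iterator as long as only bounded,
# non-negative slices/indices are requested; items are cached as consumed:
#   >>> l = LazyList(itertools.count())
#   >>> l[:5]
#   [0, 1, 2, 3, 4]
#   >>> l[3]
#   3
# len(l), reversed(l) or a negative index would try to exhaust the iterable
# and therefore never return in this example.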
2804
2805 class PagedList:
2806
2807 class IndexError(IndexError):
2808 pass
2809
2810 def __len__(self):
2811 # This is only useful for tests
2812 return len(self.getslice())
2813
2814 def __init__(self, pagefunc, pagesize, use_cache=True):
2815 self._pagefunc = pagefunc
2816 self._pagesize = pagesize
2817 self._pagecount = float('inf')
2818 self._use_cache = use_cache
2819 self._cache = {}
2820
2821 def getpage(self, pagenum):
2822 page_results = self._cache.get(pagenum)
2823 if page_results is None:
2824 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2825 if self._use_cache:
2826 self._cache[pagenum] = page_results
2827 return page_results
2828
2829 def getslice(self, start=0, end=None):
2830 return list(self._getslice(start, end))
2831
2832 def _getslice(self, start, end):
2833 raise NotImplementedError('This method must be implemented by subclasses')
2834
2835 def __getitem__(self, idx):
2836 assert self._use_cache, 'Indexing PagedList requires cache'
2837 if not isinstance(idx, int) or idx < 0:
2838 raise TypeError('indices must be non-negative integers')
2839 entries = self.getslice(idx, idx + 1)
2840 if not entries:
2841 raise self.IndexError()
2842 return entries[0]
2843
2844
2845 class OnDemandPagedList(PagedList):
2846 def _getslice(self, start, end):
2847 for pagenum in itertools.count(start // self._pagesize):
2848 firstid = pagenum * self._pagesize
2849 nextfirstid = pagenum * self._pagesize + self._pagesize
2850 if start >= nextfirstid:
2851 continue
2852
2853 startv = (
2854 start % self._pagesize
2855 if firstid <= start < nextfirstid
2856 else 0)
2857 endv = (
2858 ((end - 1) % self._pagesize) + 1
2859 if (end is not None and firstid <= end <= nextfirstid)
2860 else None)
2861
2862 try:
2863 page_results = self.getpage(pagenum)
2864 except Exception:
2865 self._pagecount = pagenum - 1
2866 raise
2867 if startv != 0 or endv is not None:
2868 page_results = page_results[startv:endv]
2869 yield from page_results
2870
2871 # A little optimization - if the current page is not "full", i.e. does
2872 # not contain page_size videos, then we can assume that this page
2873 # is the last one - there are no more ids on further pages -
2874 # i.e. no need to query again.
2875 if len(page_results) + startv < self._pagesize:
2876 break
2877
2878 # If we got the whole page, but the next page is not interesting,
2879 # break out early as well
2880 if end == nextfirstid:
2881 break
2882
2883
2884 class InAdvancePagedList(PagedList):
2885 def __init__(self, pagefunc, pagecount, pagesize):
2886 PagedList.__init__(self, pagefunc, pagesize, True)
2887 self._pagecount = pagecount
2888
2889 def _getslice(self, start, end):
2890 start_page = start // self._pagesize
2891 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2892 skip_elems = start - start_page * self._pagesize
2893 only_more = None if end is None else end - start
2894 for pagenum in range(start_page, end_page):
2895 page_results = self.getpage(pagenum)
2896 if skip_elems:
2897 page_results = page_results[skip_elems:]
2898 skip_elems = None
2899 if only_more is not None:
2900 if len(page_results) < only_more:
2901 only_more -= len(page_results)
2902 else:
2903 yield from page_results[:only_more]
2904 break
2905 yield from page_results
2906
2907
2908 def uppercase_escape(s):
2909 unicode_escape = codecs.getdecoder('unicode_escape')
2910 return re.sub(
2911 r'\\U[0-9a-fA-F]{8}',
2912 lambda m: unicode_escape(m.group(0))[0],
2913 s)
2914
2915
2916 def lowercase_escape(s):
2917 unicode_escape = codecs.getdecoder('unicode_escape')
2918 return re.sub(
2919 r'\\u[0-9a-fA-F]{4}',
2920 lambda m: unicode_escape(m.group(0))[0],
2921 s)
2922
2923
2924 def escape_rfc3986(s):
2925 """Escape non-ASCII characters as suggested by RFC 3986"""
2926 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2927 s = s.encode('utf-8')
2928 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2929
2930
2931 def escape_url(url):
2932 """Escape URL as suggested by RFC 3986"""
2933 url_parsed = compat_urllib_parse_urlparse(url)
2934 return url_parsed._replace(
2935 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2936 path=escape_rfc3986(url_parsed.path),
2937 params=escape_rfc3986(url_parsed.params),
2938 query=escape_rfc3986(url_parsed.query),
2939 fragment=escape_rfc3986(url_parsed.fragment)
2940 ).geturl()
2941
2942
2943 def parse_qs(url):
2944 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2945
2946
2947 def read_batch_urls(batch_fd):
2948 def fixup(url):
2949 if not isinstance(url, compat_str):
2950 url = url.decode('utf-8', 'replace')
2951 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2952 for bom in BOM_UTF8:
2953 if url.startswith(bom):
2954 url = url[len(bom):]
2955 url = url.lstrip()
2956 if not url or url.startswith(('#', ';', ']')):
2957 return False
2958 # "#" cannot be stripped out since it is part of the URI
2959 # However, it can be safely stripped out if it follows a whitespace
2960 return re.split(r'\s#', url, 1)[0].rstrip()
2961
2962 with contextlib.closing(batch_fd) as fd:
2963 return [url for url in map(fixup, fd) if url]
2964
2965
2966 def urlencode_postdata(*args, **kargs):
2967 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2968
2969
2970 def update_url_query(url, query):
2971 if not query:
2972 return url
2973 parsed_url = compat_urlparse.urlparse(url)
2974 qs = compat_parse_qs(parsed_url.query)
2975 qs.update(query)
2976 return compat_urlparse.urlunparse(parsed_url._replace(
2977 query=compat_urllib_parse_urlencode(qs, True)))
2978
2979
2980 def update_Request(req, url=None, data=None, headers={}, query={}):
2981 req_headers = req.headers.copy()
2982 req_headers.update(headers)
2983 req_data = data or req.data
2984 req_url = update_url_query(url or req.get_full_url(), query)
2985 req_get_method = req.get_method()
2986 if req_get_method == 'HEAD':
2987 req_type = HEADRequest
2988 elif req_get_method == 'PUT':
2989 req_type = PUTRequest
2990 else:
2991 req_type = compat_urllib_request.Request
2992 new_req = req_type(
2993 req_url, data=req_data, headers=req_headers,
2994 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2995 if hasattr(req, 'timeout'):
2996 new_req.timeout = req.timeout
2997 return new_req
2998
2999
3000 def _multipart_encode_impl(data, boundary):
3001 content_type = 'multipart/form-data; boundary=%s' % boundary
3002
3003 out = b''
3004 for k, v in data.items():
3005 out += b'--' + boundary.encode('ascii') + b'\r\n'
3006 if isinstance(k, compat_str):
3007 k = k.encode('utf-8')
3008 if isinstance(v, compat_str):
3009 v = v.encode('utf-8')
3010 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3011 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3012 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3013 if boundary.encode('ascii') in content:
3014 raise ValueError('Boundary overlaps with data')
3015 out += content
3016
3017 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3018
3019 return out, content_type
3020
3021
3022 def multipart_encode(data, boundary=None):
3023 '''
3024 Encode a dict to RFC 7578-compliant form-data
3025
3026 data:
3027 A dict where keys and values can be either Unicode or bytes-like
3028 objects.
3029 boundary:
3030 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3031 a random boundary is generated.
3032
3033 Reference: https://tools.ietf.org/html/rfc7578
3034 '''
3035 has_specified_boundary = boundary is not None
3036
3037 while True:
3038 if boundary is None:
3039 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3040
3041 try:
3042 out, content_type = _multipart_encode_impl(data, boundary)
3043 break
3044 except ValueError:
3045 if has_specified_boundary:
3046 raise
3047 boundary = None
3048
3049 return out, content_type
3050
3051
3052 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3053 if isinstance(key_or_keys, (list, tuple)):
3054 for key in key_or_keys:
3055 if key not in d or d[key] is None or skip_false_values and not d[key]:
3056 continue
3057 return d[key]
3058 return default
3059 return d.get(key_or_keys, default)
3060
3061
3062 def try_get(src, getter, expected_type=None):
3063 for get in variadic(getter):
3064 try:
3065 v = get(src)
3066 except (AttributeError, KeyError, TypeError, IndexError):
3067 pass
3068 else:
3069 if expected_type is None or isinstance(v, expected_type):
3070 return v
3071
3072
3073 def merge_dicts(*dicts):
3074 merged = {}
3075 for a_dict in dicts:
3076 for k, v in a_dict.items():
3077 if v is None:
3078 continue
3079 if (k not in merged
3080 or (isinstance(v, compat_str) and v
3081 and isinstance(merged[k], compat_str)
3082 and not merged[k])):
3083 merged[k] = v
3084 return merged
3085
3086
3087 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3088 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3089
3090
3091 US_RATINGS = {
3092 'G': 0,
3093 'PG': 10,
3094 'PG-13': 13,
3095 'R': 16,
3096 'NC': 18,
3097 }
3098
3099
3100 TV_PARENTAL_GUIDELINES = {
3101 'TV-Y': 0,
3102 'TV-Y7': 7,
3103 'TV-G': 0,
3104 'TV-PG': 0,
3105 'TV-14': 14,
3106 'TV-MA': 17,
3107 }
3108
3109
3110 def parse_age_limit(s):
3111 if type(s) == int:
3112 return s if 0 <= s <= 21 else None
3113 if not isinstance(s, compat_basestring):
3114 return None
3115 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3116 if m:
3117 return int(m.group('age'))
3118 s = s.upper()
3119 if s in US_RATINGS:
3120 return US_RATINGS[s]
3121 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3122 if m:
3123 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3124 return None
3125
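# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18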
3126
3127 def strip_jsonp(code):
3128 return re.sub(
3129 r'''(?sx)^
3130 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3131 (?:\s*&&\s*(?P=func_name))?
3132 \s*\(\s*(?P<callback_data>.*)\);?
3133 \s*?(?://[^\n]*)*$''',
3134 r'\g<callback_data>', code)
3135
3136
3137 def js_to_json(code, vars={}):
3138 # vars is a dict of var, val pairs to substitute
3139 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3140 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3141 INTEGER_TABLE = (
3142 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3143 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3144 )
3145
3146 def fix_kv(m):
3147 v = m.group(0)
3148 if v in ('true', 'false', 'null'):
3149 return v
3150 elif v in ('undefined', 'void 0'):
3151 return 'null'
3152 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3153 return ""
3154
3155 if v[0] in ("'", '"'):
3156 v = re.sub(r'(?s)\\.|"', lambda m: {
3157 '"': '\\"',
3158 "\\'": "'",
3159 '\\\n': '',
3160 '\\x': '\\u00',
3161 }.get(m.group(0), m.group(0)), v[1:-1])
3162 else:
3163 for regex, base in INTEGER_TABLE:
3164 im = re.match(regex, v)
3165 if im:
3166 i = int(im.group(1), base)
3167 return '"%d":' % i if v.endswith(':') else '%d' % i
3168
3169 if v in vars:
3170 return vars[v]
3171
3172 return '"%s"' % v
3173
3174 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3175
3176 return re.sub(r'''(?sx)
3177 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3178 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3179 {comment}|,(?={skip}[\]}}])|
3180 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3181 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3182 [0-9]+(?={skip}:)|
3183 !+
3184 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3185
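# Illustrative examples (added for clarity, not part of the original source).
# Bare keys are quoted, hex/octal literals are converted to decimal and
# 'undefined' becomes null:
#   >>> js_to_json('{abc: 0x10, key: "val"}')
#   '{"abc": 16, "key": "val"}'
#   >>> js_to_json('{a: undefined}')
#   '{"a": null}'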
3186
3187 def qualities(quality_ids):
3188 """ Get a numeric quality value out of a list of possible values """
3189 def q(qid):
3190 try:
3191 return quality_ids.index(qid)
3192 except ValueError:
3193 return -1
3194 return q
3195
3196
3197 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3198
3199
3200 DEFAULT_OUTTMPL = {
3201 'default': '%(title)s [%(id)s].%(ext)s',
3202 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3203 }
3204 OUTTMPL_TYPES = {
3205 'chapter': None,
3206 'subtitle': None,
3207 'thumbnail': None,
3208 'description': 'description',
3209 'annotation': 'annotations.xml',
3210 'infojson': 'info.json',
3211 'link': None,
3212 'pl_video': None,
3213 'pl_thumbnail': None,
3214 'pl_description': 'description',
3215 'pl_infojson': 'info.json',
3216 }
3217
3218 # As of [1] format syntax is:
3219 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3220 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3221 STR_FORMAT_RE_TMPL = r'''(?x)
3222 (?<!%)(?P<prefix>(?:%%)*)
3223 %
3224 (?P<has_key>\((?P<key>{0})\))?
3225 (?P<format>
3226 (?P<conversion>[#0\-+ ]+)?
3227 (?P<min_width>\d+)?
3228 (?P<precision>\.\d+)?
3229 (?P<len_mod>[hlL])? # unused in python
3230 {1} # conversion type
3231 )
3232 '''
3233
3234
3235 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3236
3237
3238 def limit_length(s, length):
3239 """ Add ellipses to overly long strings """
3240 if s is None:
3241 return None
3242 ELLIPSES = '...'
3243 if len(s) > length:
3244 return s[:length - len(ELLIPSES)] + ELLIPSES
3245 return s
3246
3247
3248 def version_tuple(v):
3249 return tuple(int(e) for e in re.split(r'[-.]', v))
3250
3251
3252 def is_outdated_version(version, limit, assume_new=True):
3253 if not version:
3254 return not assume_new
3255 try:
3256 return version_tuple(version) < version_tuple(limit)
3257 except ValueError:
3258 return not assume_new
3259
3260
3261 def ytdl_is_updateable():
3262 """ Returns if yt-dlp can be updated with -U """
3263
3264 from .update import is_non_updateable
3265
3266 return not is_non_updateable()
3267
3268
3269 def args_to_str(args):
3270 # Get a short string representation for a subprocess command
3271 return ' '.join(compat_shlex_quote(a) for a in args)
3272
3273
3274 def error_to_compat_str(err):
3275 err_str = str(err)
3276 # On python 2 error byte string must be decoded with proper
3277 # encoding rather than ascii
3278 if sys.version_info[0] < 3:
3279 err_str = err_str.decode(preferredencoding())
3280 return err_str
3281
3282
3283 def mimetype2ext(mt):
3284 if mt is None:
3285 return None
3286
3287 mt, _, params = mt.partition(';')
3288 mt = mt.strip()
3289
3290 FULL_MAP = {
3291 'audio/mp4': 'm4a',
3292 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here we use .mp3 as
3293 # it's the most popular one
3294 'audio/mpeg': 'mp3',
3295 'audio/x-wav': 'wav',
3296 'audio/wav': 'wav',
3297 'audio/wave': 'wav',
3298 }
3299
3300 ext = FULL_MAP.get(mt)
3301 if ext is not None:
3302 return ext
3303
3304 SUBTYPE_MAP = {
3305 '3gpp': '3gp',
3306 'smptett+xml': 'tt',
3307 'ttaf+xml': 'dfxp',
3308 'ttml+xml': 'ttml',
3309 'x-flv': 'flv',
3310 'x-mp4-fragmented': 'mp4',
3311 'x-ms-sami': 'sami',
3312 'x-ms-wmv': 'wmv',
3313 'mpegurl': 'm3u8',
3314 'x-mpegurl': 'm3u8',
3315 'vnd.apple.mpegurl': 'm3u8',
3316 'dash+xml': 'mpd',
3317 'f4m+xml': 'f4m',
3318 'hds+xml': 'f4m',
3319 'vnd.ms-sstr+xml': 'ism',
3320 'quicktime': 'mov',
3321 'mp2t': 'ts',
3322 'x-wav': 'wav',
3323 'filmstrip+json': 'fs',
3324 'svg+xml': 'svg',
3325 }
3326
3327 _, _, subtype = mt.rpartition('/')
3328 ext = SUBTYPE_MAP.get(subtype.lower())
3329 if ext is not None:
3330 return ext
3331
3332 SUFFIX_MAP = {
3333 'json': 'json',
3334 'xml': 'xml',
3335 'zip': 'zip',
3336 'gzip': 'gz',
3337 }
3338
3339 _, _, suffix = subtype.partition('+')
3340 ext = SUFFIX_MAP.get(suffix)
3341 if ext is not None:
3342 return ext
3343
3344 return subtype.replace('+', '.')
3345
3346
3347 def ext2mimetype(ext_or_url):
3348 if not ext_or_url:
3349 return None
3350 if '.' not in ext_or_url:
3351 ext_or_url = f'file.{ext_or_url}'
3352 return mimetypes.guess_type(ext_or_url)[0]
3353
3354
3355 def parse_codecs(codecs_str):
3356 # http://tools.ietf.org/html/rfc6381
3357 if not codecs_str:
3358 return {}
3359 split_codecs = list(filter(None, map(
3360 str.strip, codecs_str.strip().strip(',').split(','))))
3361 vcodec, acodec, tcodec, hdr = None, None, None, None
3362 for full_codec in split_codecs:
3363 parts = full_codec.split('.')
3364 codec = parts[0].replace('0', '')
3365 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3366 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3367 if not vcodec:
3368 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3369 if codec in ('dvh1', 'dvhe'):
3370 hdr = 'DV'
3371 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3372 hdr = 'HDR10'
3373 elif full_codec.replace('0', '').startswith('vp9.2'):
3374 hdr = 'HDR10'
3375 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3376 if not acodec:
3377 acodec = full_codec
3378 elif codec in ('stpp', 'wvtt',):
3379 if not tcodec:
3380 tcodec = full_codec
3381 else:
3382 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3383 if vcodec or acodec or tcodec:
3384 return {
3385 'vcodec': vcodec or 'none',
3386 'acodec': acodec or 'none',
3387 'dynamic_range': hdr,
3388 **({'tcodec': tcodec} if tcodec is not None else {}),
3389 }
3390 elif len(split_codecs) == 2:
3391 return {
3392 'vcodec': split_codecs[0],
3393 'acodec': split_codecs[1],
3394 }
3395 return {}
3396
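# Illustrative example (added for clarity, not part of the original source):
#   >>> parse_codecs('avc1.640028,mp4a.40.2')
#   {'vcodec': 'avc1.640028', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
# Dolby Vision (dvh1/dvhe) and 10-bit AV1/VP9.2 streams additionally set
# 'dynamic_range' to 'DV' or 'HDR10'.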
3397
3398 def urlhandle_detect_ext(url_handle):
3399 getheader = url_handle.headers.get
3400
3401 cd = getheader('Content-Disposition')
3402 if cd:
3403 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3404 if m:
3405 e = determine_ext(m.group('filename'), default_ext=None)
3406 if e:
3407 return e
3408
3409 return mimetype2ext(getheader('Content-Type'))
3410
3411
3412 def encode_data_uri(data, mime_type):
3413 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3414
3415
3416 def age_restricted(content_limit, age_limit):
3417 """ Returns True iff the content should be blocked """
3418
3419 if age_limit is None: # No limit set
3420 return False
3421 if content_limit is None:
3422 return False # Content available for everyone
3423 return age_limit < content_limit
3424
3425
3426 def is_html(first_bytes):
3427 """ Detect whether a file contains HTML by examining its first bytes. """
3428
3429 BOMS = [
3430 (b'\xef\xbb\xbf', 'utf-8'),
3431 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3432 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3433 (b'\xff\xfe', 'utf-16-le'),
3434 (b'\xfe\xff', 'utf-16-be'),
3435 ]
3436 for bom, enc in BOMS:
3437 if first_bytes.startswith(bom):
3438 s = first_bytes[len(bom):].decode(enc, 'replace')
3439 break
3440 else:
3441 s = first_bytes.decode('utf-8', 'replace')
3442
3443 return re.match(r'^\s*<', s)
3444
3445
3446 def determine_protocol(info_dict):
3447 protocol = info_dict.get('protocol')
3448 if protocol is not None:
3449 return protocol
3450
3451 url = sanitize_url(info_dict['url'])
3452 if url.startswith('rtmp'):
3453 return 'rtmp'
3454 elif url.startswith('mms'):
3455 return 'mms'
3456 elif url.startswith('rtsp'):
3457 return 'rtsp'
3458
3459 ext = determine_ext(url)
3460 if ext == 'm3u8':
3461 return 'm3u8'
3462 elif ext == 'f4m':
3463 return 'f4m'
3464
3465 return compat_urllib_parse_urlparse(url).scheme
3466
3467
3468 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3469 """ Render a list of rows, each as a list of values.
3470 Text after a \t will be right aligned """
3471 def width(string):
3472 return len(remove_terminal_sequences(string).replace('\t', ''))
3473
3474 def get_max_lens(table):
3475 return [max(width(str(v)) for v in col) for col in zip(*table)]
3476
3477 def filter_using_list(row, filterArray):
3478 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3479
3480 max_lens = get_max_lens(data) if hide_empty else []
3481 header_row = filter_using_list(header_row, max_lens)
3482 data = [filter_using_list(row, max_lens) for row in data]
3483
3484 table = [header_row] + data
3485 max_lens = get_max_lens(table)
3486 extra_gap += 1
3487 if delim:
3488 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3489 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3490 for row in table:
3491 for pos, text in enumerate(map(str, row)):
3492 if '\t' in text:
3493 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3494 else:
3495 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3496 ret = '\n'.join(''.join(row).rstrip() for row in table)
3497 return ret
3498
3499
3500 def _match_one(filter_part, dct, incomplete):
3501 # TODO: Generalize code with YoutubeDL._build_format_filter
3502 STRING_OPERATORS = {
3503 '*=': operator.contains,
3504 '^=': lambda attr, value: attr.startswith(value),
3505 '$=': lambda attr, value: attr.endswith(value),
3506 '~=': lambda attr, value: re.search(value, attr),
3507 }
3508 COMPARISON_OPERATORS = {
3509 **STRING_OPERATORS,
3510 '<=': operator.le, # "<=" must be defined above "<"
3511 '<': operator.lt,
3512 '>=': operator.ge,
3513 '>': operator.gt,
3514 '=': operator.eq,
3515 }
3516
3517 operator_rex = re.compile(r'''(?x)\s*
3518 (?P<key>[a-z_]+)
3519 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3520 (?:
3521 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3522 (?P<strval>.+?)
3523 )
3524 \s*$
3525 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3526 m = operator_rex.search(filter_part)
3527 if m:
3528 m = m.groupdict()
3529 unnegated_op = COMPARISON_OPERATORS[m['op']]
3530 if m['negation']:
3531 op = lambda attr, value: not unnegated_op(attr, value)
3532 else:
3533 op = unnegated_op
3534 comparison_value = m['quotedstrval'] or m['strval']
3535 if m['quote']:
3536 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3537 actual_value = dct.get(m['key'])
3538 numeric_comparison = None
3539 if isinstance(actual_value, compat_numeric_types):
3540 # If the original field is a string and the matching comparison value is
3541 # a number, we should respect the origin of the original field
3542 # and process comparison value as a string (see
3543 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3544 try:
3545 numeric_comparison = int(comparison_value)
3546 except ValueError:
3547 numeric_comparison = parse_filesize(comparison_value)
3548 if numeric_comparison is None:
3549 numeric_comparison = parse_filesize(f'{comparison_value}B')
3550 if numeric_comparison is None:
3551 numeric_comparison = parse_duration(comparison_value)
3552 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3553 raise ValueError('Operator %s only supports string values!' % m['op'])
3554 if actual_value is None:
3555 return incomplete or m['none_inclusive']
3556 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3557
3558 UNARY_OPERATORS = {
3559 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3560 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3561 }
3562 operator_rex = re.compile(r'''(?x)\s*
3563 (?P<op>%s)\s*(?P<key>[a-z_]+)
3564 \s*$
3565 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3566 m = operator_rex.search(filter_part)
3567 if m:
3568 op = UNARY_OPERATORS[m.group('op')]
3569 actual_value = dct.get(m.group('key'))
3570 if incomplete and actual_value is None:
3571 return True
3572 return op(actual_value)
3573
3574 raise ValueError('Invalid filter part %r' % filter_part)
3575
3576
3577 def match_str(filter_str, dct, incomplete=False):
3578 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3579 When incomplete, all conditions passes on missing fields
3580 """
3581 return all(
3582 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3583 for filter_part in re.split(r'(?<!\\)&', filter_str))
3584
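# Illustrative examples (added for clarity, not part of the original source):
#   >>> match_str('duration > 600 & !is_live', {'duration': 700, 'is_live': False})
#   True
#   >>> match_str('like_count >? 100', {})    # '?' passes when the field is missing
#   True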
3585
3586 def match_filter_func(filter_str):
3587 def _match_func(info_dict, *args, **kwargs):
3588 if match_str(filter_str, info_dict, *args, **kwargs):
3589 return None
3590 else:
3591 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3592 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3593 return _match_func
3594
3595
3596 def parse_dfxp_time_expr(time_expr):
3597 if not time_expr:
3598 return
3599
3600 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3601 if mobj:
3602 return float(mobj.group('time_offset'))
3603
3604 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3605 if mobj:
3606 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3607
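# Illustrative values:
#   parse_dfxp_time_expr('12.3s')      -> 12.3
#   parse_dfxp_time_expr('00:01:02.5') -> 62.5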
3608
3609 def srt_subtitles_timecode(seconds):
3610 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3611
3612
3613 def ass_subtitles_timecode(seconds):
3614 time = timetuple_from_msec(seconds * 1000)
3615 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3616
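# For example, assuming timetuple_from_msec yields (hours, minutes, seconds, milliseconds):
#   srt_subtitles_timecode(61.5) -> '00:01:01,500'
#   ass_subtitles_timecode(61.5) -> '0:01:01.50'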
3617
3618 def dfxp2srt(dfxp_data):
3619 '''
3620 @param dfxp_data A bytes-like object containing DFXP data
3621 @returns A unicode object containing converted SRT data
3622 '''
3623 LEGACY_NAMESPACES = (
3624 (b'http://www.w3.org/ns/ttml', [
3625 b'http://www.w3.org/2004/11/ttaf1',
3626 b'http://www.w3.org/2006/04/ttaf1',
3627 b'http://www.w3.org/2006/10/ttaf1',
3628 ]),
3629 (b'http://www.w3.org/ns/ttml#styling', [
3630 b'http://www.w3.org/ns/ttml#style',
3631 ]),
3632 )
3633
3634 SUPPORTED_STYLING = [
3635 'color',
3636 'fontFamily',
3637 'fontSize',
3638 'fontStyle',
3639 'fontWeight',
3640 'textDecoration'
3641 ]
3642
3643 _x = functools.partial(xpath_with_ns, ns_map={
3644 'xml': 'http://www.w3.org/XML/1998/namespace',
3645 'ttml': 'http://www.w3.org/ns/ttml',
3646 'tts': 'http://www.w3.org/ns/ttml#styling',
3647 })
3648
3649 styles = {}
3650 default_style = {}
3651
3652 class TTMLPElementParser(object):
3653 _out = ''
3654 _unclosed_elements = []
3655 _applied_styles = []
3656
3657 def start(self, tag, attrib):
3658 if tag in (_x('ttml:br'), 'br'):
3659 self._out += '\n'
3660 else:
3661 unclosed_elements = []
3662 style = {}
3663 element_style_id = attrib.get('style')
3664 if default_style:
3665 style.update(default_style)
3666 if element_style_id:
3667 style.update(styles.get(element_style_id, {}))
3668 for prop in SUPPORTED_STYLING:
3669 prop_val = attrib.get(_x('tts:' + prop))
3670 if prop_val:
3671 style[prop] = prop_val
3672 if style:
3673 font = ''
3674 for k, v in sorted(style.items()):
3675 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3676 continue
3677 if k == 'color':
3678 font += ' color="%s"' % v
3679 elif k == 'fontSize':
3680 font += ' size="%s"' % v
3681 elif k == 'fontFamily':
3682 font += ' face="%s"' % v
3683 elif k == 'fontWeight' and v == 'bold':
3684 self._out += '<b>'
3685 unclosed_elements.append('b')
3686 elif k == 'fontStyle' and v == 'italic':
3687 self._out += '<i>'
3688 unclosed_elements.append('i')
3689 elif k == 'textDecoration' and v == 'underline':
3690 self._out += '<u>'
3691 unclosed_elements.append('u')
3692 if font:
3693 self._out += '<font' + font + '>'
3694 unclosed_elements.append('font')
3695 applied_style = {}
3696 if self._applied_styles:
3697 applied_style.update(self._applied_styles[-1])
3698 applied_style.update(style)
3699 self._applied_styles.append(applied_style)
3700 self._unclosed_elements.append(unclosed_elements)
3701
3702 def end(self, tag):
3703 if tag not in (_x('ttml:br'), 'br'):
3704 unclosed_elements = self._unclosed_elements.pop()
3705 for element in reversed(unclosed_elements):
3706 self._out += '</%s>' % element
3707 if unclosed_elements and self._applied_styles:
3708 self._applied_styles.pop()
3709
3710 def data(self, data):
3711 self._out += data
3712
3713 def close(self):
3714 return self._out.strip()
3715
3716 def parse_node(node):
3717 target = TTMLPElementParser()
3718 parser = xml.etree.ElementTree.XMLParser(target=target)
3719 parser.feed(xml.etree.ElementTree.tostring(node))
3720 return parser.close()
3721
3722 for k, v in LEGACY_NAMESPACES:
3723 for ns in v:
3724 dfxp_data = dfxp_data.replace(ns, k)
3725
3726 dfxp = compat_etree_fromstring(dfxp_data)
3727 out = []
3728 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3729
3730 if not paras:
3731 raise ValueError('Invalid dfxp/TTML subtitle')
3732
3733 repeat = False
3734 while True:
3735 for style in dfxp.findall(_x('.//ttml:style')):
3736 style_id = style.get('id') or style.get(_x('xml:id'))
3737 if not style_id:
3738 continue
3739 parent_style_id = style.get('style')
3740 if parent_style_id:
3741 if parent_style_id not in styles:
3742 repeat = True
3743 continue
3744 styles[style_id] = styles[parent_style_id].copy()
3745 for prop in SUPPORTED_STYLING:
3746 prop_val = style.get(_x('tts:' + prop))
3747 if prop_val:
3748 styles.setdefault(style_id, {})[prop] = prop_val
3749 if repeat:
3750 repeat = False
3751 else:
3752 break
3753
3754 for p in ('body', 'div'):
3755 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3756 if ele is None:
3757 continue
3758 style = styles.get(ele.get('style'))
3759 if not style:
3760 continue
3761 default_style.update(style)
3762
3763 for para, index in zip(paras, itertools.count(1)):
3764 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3765 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3766 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3767 if begin_time is None:
3768 continue
3769 if not end_time:
3770 if not dur:
3771 continue
3772 end_time = begin_time + dur
3773 out.append('%d\n%s --> %s\n%s\n\n' % (
3774 index,
3775 srt_subtitles_timecode(begin_time),
3776 srt_subtitles_timecode(end_time),
3777 parse_node(para)))
3778
3779 return ''.join(out)
3780
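# Minimal illustration (sketch, not a full TTML document): a paragraph such as
#   <p begin="1.0" end="2.5">Hello</p>
# is rendered as the SRT cue
#   1
#   00:00:01,000 --> 00:00:02,500
#   Hello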
3781
3782 def cli_option(params, command_option, param):
3783 param = params.get(param)
3784 if param:
3785 param = compat_str(param)
3786 return [command_option, param] if param is not None else []
3787
3788
3789 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3790 param = params.get(param)
3791 if param is None:
3792 return []
3793 assert isinstance(param, bool)
3794 if separator:
3795 return [command_option + separator + (true_value if param else false_value)]
3796 return [command_option, true_value if param else false_value]
3797
3798
3799 def cli_valueless_option(params, command_option, param, expected_value=True):
3800 param = params.get(param)
3801 return [command_option] if param == expected_value else []
3802
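# How these assemble external-binary arguments (hypothetical params dict and keys):
#   cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#   -> ['--proxy', 'socks5://127.0.0.1:1080']
#   cli_bool_option({'check_cert': True}, '--check-certificate', 'check_cert',
#                   'true', 'false', '=') -> ['--check-certificate=true']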
3803
3804 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3805 if isinstance(argdict, (list, tuple)): # for backward compatibility
3806 if use_compat:
3807 return argdict
3808 else:
3809 argdict = None
3810 if argdict is None:
3811 return default
3812 assert isinstance(argdict, dict)
3813
3814 assert isinstance(keys, (list, tuple))
3815 for key_list in keys:
3816 arg_list = list(filter(
3817 lambda x: x is not None,
3818 [argdict.get(key.lower()) for key in variadic(key_list)]))
3819 if arg_list:
3820 return [arg for args in arg_list for arg in args]
3821 return default
3822
3823
3824 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3825 main_key, exe = main_key.lower(), exe.lower()
3826 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3827 keys = [f'{root_key}{k}' for k in (keys or [''])]
3828 if root_key in keys:
3829 if main_key != exe:
3830 keys.append((main_key, exe))
3831 keys.append('default')
3832 else:
3833 use_compat = False
3834 return cli_configuration_args(argdict, keys, default, use_compat)
3835
3836
3837 class ISO639Utils(object):
3838 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3839 _lang_map = {
3840 'aa': 'aar',
3841 'ab': 'abk',
3842 'ae': 'ave',
3843 'af': 'afr',
3844 'ak': 'aka',
3845 'am': 'amh',
3846 'an': 'arg',
3847 'ar': 'ara',
3848 'as': 'asm',
3849 'av': 'ava',
3850 'ay': 'aym',
3851 'az': 'aze',
3852 'ba': 'bak',
3853 'be': 'bel',
3854 'bg': 'bul',
3855 'bh': 'bih',
3856 'bi': 'bis',
3857 'bm': 'bam',
3858 'bn': 'ben',
3859 'bo': 'bod',
3860 'br': 'bre',
3861 'bs': 'bos',
3862 'ca': 'cat',
3863 'ce': 'che',
3864 'ch': 'cha',
3865 'co': 'cos',
3866 'cr': 'cre',
3867 'cs': 'ces',
3868 'cu': 'chu',
3869 'cv': 'chv',
3870 'cy': 'cym',
3871 'da': 'dan',
3872 'de': 'deu',
3873 'dv': 'div',
3874 'dz': 'dzo',
3875 'ee': 'ewe',
3876 'el': 'ell',
3877 'en': 'eng',
3878 'eo': 'epo',
3879 'es': 'spa',
3880 'et': 'est',
3881 'eu': 'eus',
3882 'fa': 'fas',
3883 'ff': 'ful',
3884 'fi': 'fin',
3885 'fj': 'fij',
3886 'fo': 'fao',
3887 'fr': 'fra',
3888 'fy': 'fry',
3889 'ga': 'gle',
3890 'gd': 'gla',
3891 'gl': 'glg',
3892 'gn': 'grn',
3893 'gu': 'guj',
3894 'gv': 'glv',
3895 'ha': 'hau',
3896 'he': 'heb',
3897 'iw': 'heb', # Replaced by he in 1989 revision
3898 'hi': 'hin',
3899 'ho': 'hmo',
3900 'hr': 'hrv',
3901 'ht': 'hat',
3902 'hu': 'hun',
3903 'hy': 'hye',
3904 'hz': 'her',
3905 'ia': 'ina',
3906 'id': 'ind',
3907 'in': 'ind', # Replaced by id in 1989 revision
3908 'ie': 'ile',
3909 'ig': 'ibo',
3910 'ii': 'iii',
3911 'ik': 'ipk',
3912 'io': 'ido',
3913 'is': 'isl',
3914 'it': 'ita',
3915 'iu': 'iku',
3916 'ja': 'jpn',
3917 'jv': 'jav',
3918 'ka': 'kat',
3919 'kg': 'kon',
3920 'ki': 'kik',
3921 'kj': 'kua',
3922 'kk': 'kaz',
3923 'kl': 'kal',
3924 'km': 'khm',
3925 'kn': 'kan',
3926 'ko': 'kor',
3927 'kr': 'kau',
3928 'ks': 'kas',
3929 'ku': 'kur',
3930 'kv': 'kom',
3931 'kw': 'cor',
3932 'ky': 'kir',
3933 'la': 'lat',
3934 'lb': 'ltz',
3935 'lg': 'lug',
3936 'li': 'lim',
3937 'ln': 'lin',
3938 'lo': 'lao',
3939 'lt': 'lit',
3940 'lu': 'lub',
3941 'lv': 'lav',
3942 'mg': 'mlg',
3943 'mh': 'mah',
3944 'mi': 'mri',
3945 'mk': 'mkd',
3946 'ml': 'mal',
3947 'mn': 'mon',
3948 'mr': 'mar',
3949 'ms': 'msa',
3950 'mt': 'mlt',
3951 'my': 'mya',
3952 'na': 'nau',
3953 'nb': 'nob',
3954 'nd': 'nde',
3955 'ne': 'nep',
3956 'ng': 'ndo',
3957 'nl': 'nld',
3958 'nn': 'nno',
3959 'no': 'nor',
3960 'nr': 'nbl',
3961 'nv': 'nav',
3962 'ny': 'nya',
3963 'oc': 'oci',
3964 'oj': 'oji',
3965 'om': 'orm',
3966 'or': 'ori',
3967 'os': 'oss',
3968 'pa': 'pan',
3969 'pi': 'pli',
3970 'pl': 'pol',
3971 'ps': 'pus',
3972 'pt': 'por',
3973 'qu': 'que',
3974 'rm': 'roh',
3975 'rn': 'run',
3976 'ro': 'ron',
3977 'ru': 'rus',
3978 'rw': 'kin',
3979 'sa': 'san',
3980 'sc': 'srd',
3981 'sd': 'snd',
3982 'se': 'sme',
3983 'sg': 'sag',
3984 'si': 'sin',
3985 'sk': 'slk',
3986 'sl': 'slv',
3987 'sm': 'smo',
3988 'sn': 'sna',
3989 'so': 'som',
3990 'sq': 'sqi',
3991 'sr': 'srp',
3992 'ss': 'ssw',
3993 'st': 'sot',
3994 'su': 'sun',
3995 'sv': 'swe',
3996 'sw': 'swa',
3997 'ta': 'tam',
3998 'te': 'tel',
3999 'tg': 'tgk',
4000 'th': 'tha',
4001 'ti': 'tir',
4002 'tk': 'tuk',
4003 'tl': 'tgl',
4004 'tn': 'tsn',
4005 'to': 'ton',
4006 'tr': 'tur',
4007 'ts': 'tso',
4008 'tt': 'tat',
4009 'tw': 'twi',
4010 'ty': 'tah',
4011 'ug': 'uig',
4012 'uk': 'ukr',
4013 'ur': 'urd',
4014 'uz': 'uzb',
4015 've': 'ven',
4016 'vi': 'vie',
4017 'vo': 'vol',
4018 'wa': 'wln',
4019 'wo': 'wol',
4020 'xh': 'xho',
4021 'yi': 'yid',
4022 'ji': 'yid', # Replaced by yi in 1989 revision
4023 'yo': 'yor',
4024 'za': 'zha',
4025 'zh': 'zho',
4026 'zu': 'zul',
4027 }
4028
4029 @classmethod
4030 def short2long(cls, code):
4031 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4032 return cls._lang_map.get(code[:2])
4033
4034 @classmethod
4035 def long2short(cls, code):
4036 """Convert language code from ISO 639-2/T to ISO 639-1"""
4037 for short_name, long_name in cls._lang_map.items():
4038 if long_name == code:
4039 return short_name
4040
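# e.g. ISO639Utils.short2long('en') -> 'eng'; ISO639Utils.long2short('fra') -> 'fr'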
4041
4042 class ISO3166Utils(object):
4043 # From http://data.okfn.org/data/core/country-list
4044 _country_map = {
4045 'AF': 'Afghanistan',
4046 'AX': 'Åland Islands',
4047 'AL': 'Albania',
4048 'DZ': 'Algeria',
4049 'AS': 'American Samoa',
4050 'AD': 'Andorra',
4051 'AO': 'Angola',
4052 'AI': 'Anguilla',
4053 'AQ': 'Antarctica',
4054 'AG': 'Antigua and Barbuda',
4055 'AR': 'Argentina',
4056 'AM': 'Armenia',
4057 'AW': 'Aruba',
4058 'AU': 'Australia',
4059 'AT': 'Austria',
4060 'AZ': 'Azerbaijan',
4061 'BS': 'Bahamas',
4062 'BH': 'Bahrain',
4063 'BD': 'Bangladesh',
4064 'BB': 'Barbados',
4065 'BY': 'Belarus',
4066 'BE': 'Belgium',
4067 'BZ': 'Belize',
4068 'BJ': 'Benin',
4069 'BM': 'Bermuda',
4070 'BT': 'Bhutan',
4071 'BO': 'Bolivia, Plurinational State of',
4072 'BQ': 'Bonaire, Sint Eustatius and Saba',
4073 'BA': 'Bosnia and Herzegovina',
4074 'BW': 'Botswana',
4075 'BV': 'Bouvet Island',
4076 'BR': 'Brazil',
4077 'IO': 'British Indian Ocean Territory',
4078 'BN': 'Brunei Darussalam',
4079 'BG': 'Bulgaria',
4080 'BF': 'Burkina Faso',
4081 'BI': 'Burundi',
4082 'KH': 'Cambodia',
4083 'CM': 'Cameroon',
4084 'CA': 'Canada',
4085 'CV': 'Cape Verde',
4086 'KY': 'Cayman Islands',
4087 'CF': 'Central African Republic',
4088 'TD': 'Chad',
4089 'CL': 'Chile',
4090 'CN': 'China',
4091 'CX': 'Christmas Island',
4092 'CC': 'Cocos (Keeling) Islands',
4093 'CO': 'Colombia',
4094 'KM': 'Comoros',
4095 'CG': 'Congo',
4096 'CD': 'Congo, the Democratic Republic of the',
4097 'CK': 'Cook Islands',
4098 'CR': 'Costa Rica',
4099 'CI': 'Côte d\'Ivoire',
4100 'HR': 'Croatia',
4101 'CU': 'Cuba',
4102 'CW': 'Curaçao',
4103 'CY': 'Cyprus',
4104 'CZ': 'Czech Republic',
4105 'DK': 'Denmark',
4106 'DJ': 'Djibouti',
4107 'DM': 'Dominica',
4108 'DO': 'Dominican Republic',
4109 'EC': 'Ecuador',
4110 'EG': 'Egypt',
4111 'SV': 'El Salvador',
4112 'GQ': 'Equatorial Guinea',
4113 'ER': 'Eritrea',
4114 'EE': 'Estonia',
4115 'ET': 'Ethiopia',
4116 'FK': 'Falkland Islands (Malvinas)',
4117 'FO': 'Faroe Islands',
4118 'FJ': 'Fiji',
4119 'FI': 'Finland',
4120 'FR': 'France',
4121 'GF': 'French Guiana',
4122 'PF': 'French Polynesia',
4123 'TF': 'French Southern Territories',
4124 'GA': 'Gabon',
4125 'GM': 'Gambia',
4126 'GE': 'Georgia',
4127 'DE': 'Germany',
4128 'GH': 'Ghana',
4129 'GI': 'Gibraltar',
4130 'GR': 'Greece',
4131 'GL': 'Greenland',
4132 'GD': 'Grenada',
4133 'GP': 'Guadeloupe',
4134 'GU': 'Guam',
4135 'GT': 'Guatemala',
4136 'GG': 'Guernsey',
4137 'GN': 'Guinea',
4138 'GW': 'Guinea-Bissau',
4139 'GY': 'Guyana',
4140 'HT': 'Haiti',
4141 'HM': 'Heard Island and McDonald Islands',
4142 'VA': 'Holy See (Vatican City State)',
4143 'HN': 'Honduras',
4144 'HK': 'Hong Kong',
4145 'HU': 'Hungary',
4146 'IS': 'Iceland',
4147 'IN': 'India',
4148 'ID': 'Indonesia',
4149 'IR': 'Iran, Islamic Republic of',
4150 'IQ': 'Iraq',
4151 'IE': 'Ireland',
4152 'IM': 'Isle of Man',
4153 'IL': 'Israel',
4154 'IT': 'Italy',
4155 'JM': 'Jamaica',
4156 'JP': 'Japan',
4157 'JE': 'Jersey',
4158 'JO': 'Jordan',
4159 'KZ': 'Kazakhstan',
4160 'KE': 'Kenya',
4161 'KI': 'Kiribati',
4162 'KP': 'Korea, Democratic People\'s Republic of',
4163 'KR': 'Korea, Republic of',
4164 'KW': 'Kuwait',
4165 'KG': 'Kyrgyzstan',
4166 'LA': 'Lao People\'s Democratic Republic',
4167 'LV': 'Latvia',
4168 'LB': 'Lebanon',
4169 'LS': 'Lesotho',
4170 'LR': 'Liberia',
4171 'LY': 'Libya',
4172 'LI': 'Liechtenstein',
4173 'LT': 'Lithuania',
4174 'LU': 'Luxembourg',
4175 'MO': 'Macao',
4176 'MK': 'Macedonia, the Former Yugoslav Republic of',
4177 'MG': 'Madagascar',
4178 'MW': 'Malawi',
4179 'MY': 'Malaysia',
4180 'MV': 'Maldives',
4181 'ML': 'Mali',
4182 'MT': 'Malta',
4183 'MH': 'Marshall Islands',
4184 'MQ': 'Martinique',
4185 'MR': 'Mauritania',
4186 'MU': 'Mauritius',
4187 'YT': 'Mayotte',
4188 'MX': 'Mexico',
4189 'FM': 'Micronesia, Federated States of',
4190 'MD': 'Moldova, Republic of',
4191 'MC': 'Monaco',
4192 'MN': 'Mongolia',
4193 'ME': 'Montenegro',
4194 'MS': 'Montserrat',
4195 'MA': 'Morocco',
4196 'MZ': 'Mozambique',
4197 'MM': 'Myanmar',
4198 'NA': 'Namibia',
4199 'NR': 'Nauru',
4200 'NP': 'Nepal',
4201 'NL': 'Netherlands',
4202 'NC': 'New Caledonia',
4203 'NZ': 'New Zealand',
4204 'NI': 'Nicaragua',
4205 'NE': 'Niger',
4206 'NG': 'Nigeria',
4207 'NU': 'Niue',
4208 'NF': 'Norfolk Island',
4209 'MP': 'Northern Mariana Islands',
4210 'NO': 'Norway',
4211 'OM': 'Oman',
4212 'PK': 'Pakistan',
4213 'PW': 'Palau',
4214 'PS': 'Palestine, State of',
4215 'PA': 'Panama',
4216 'PG': 'Papua New Guinea',
4217 'PY': 'Paraguay',
4218 'PE': 'Peru',
4219 'PH': 'Philippines',
4220 'PN': 'Pitcairn',
4221 'PL': 'Poland',
4222 'PT': 'Portugal',
4223 'PR': 'Puerto Rico',
4224 'QA': 'Qatar',
4225 'RE': 'Réunion',
4226 'RO': 'Romania',
4227 'RU': 'Russian Federation',
4228 'RW': 'Rwanda',
4229 'BL': 'Saint Barthélemy',
4230 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4231 'KN': 'Saint Kitts and Nevis',
4232 'LC': 'Saint Lucia',
4233 'MF': 'Saint Martin (French part)',
4234 'PM': 'Saint Pierre and Miquelon',
4235 'VC': 'Saint Vincent and the Grenadines',
4236 'WS': 'Samoa',
4237 'SM': 'San Marino',
4238 'ST': 'Sao Tome and Principe',
4239 'SA': 'Saudi Arabia',
4240 'SN': 'Senegal',
4241 'RS': 'Serbia',
4242 'SC': 'Seychelles',
4243 'SL': 'Sierra Leone',
4244 'SG': 'Singapore',
4245 'SX': 'Sint Maarten (Dutch part)',
4246 'SK': 'Slovakia',
4247 'SI': 'Slovenia',
4248 'SB': 'Solomon Islands',
4249 'SO': 'Somalia',
4250 'ZA': 'South Africa',
4251 'GS': 'South Georgia and the South Sandwich Islands',
4252 'SS': 'South Sudan',
4253 'ES': 'Spain',
4254 'LK': 'Sri Lanka',
4255 'SD': 'Sudan',
4256 'SR': 'Suriname',
4257 'SJ': 'Svalbard and Jan Mayen',
4258 'SZ': 'Swaziland',
4259 'SE': 'Sweden',
4260 'CH': 'Switzerland',
4261 'SY': 'Syrian Arab Republic',
4262 'TW': 'Taiwan, Province of China',
4263 'TJ': 'Tajikistan',
4264 'TZ': 'Tanzania, United Republic of',
4265 'TH': 'Thailand',
4266 'TL': 'Timor-Leste',
4267 'TG': 'Togo',
4268 'TK': 'Tokelau',
4269 'TO': 'Tonga',
4270 'TT': 'Trinidad and Tobago',
4271 'TN': 'Tunisia',
4272 'TR': 'Turkey',
4273 'TM': 'Turkmenistan',
4274 'TC': 'Turks and Caicos Islands',
4275 'TV': 'Tuvalu',
4276 'UG': 'Uganda',
4277 'UA': 'Ukraine',
4278 'AE': 'United Arab Emirates',
4279 'GB': 'United Kingdom',
4280 'US': 'United States',
4281 'UM': 'United States Minor Outlying Islands',
4282 'UY': 'Uruguay',
4283 'UZ': 'Uzbekistan',
4284 'VU': 'Vanuatu',
4285 'VE': 'Venezuela, Bolivarian Republic of',
4286 'VN': 'Viet Nam',
4287 'VG': 'Virgin Islands, British',
4288 'VI': 'Virgin Islands, U.S.',
4289 'WF': 'Wallis and Futuna',
4290 'EH': 'Western Sahara',
4291 'YE': 'Yemen',
4292 'ZM': 'Zambia',
4293 'ZW': 'Zimbabwe',
4294 }
4295
4296 @classmethod
4297 def short2full(cls, code):
4298 """Convert an ISO 3166-2 country code to the corresponding full name"""
4299 return cls._country_map.get(code.upper())
4300
4301
4302 class GeoUtils(object):
4303 # Major IPv4 address blocks per country
4304 _country_ip_map = {
4305 'AD': '46.172.224.0/19',
4306 'AE': '94.200.0.0/13',
4307 'AF': '149.54.0.0/17',
4308 'AG': '209.59.64.0/18',
4309 'AI': '204.14.248.0/21',
4310 'AL': '46.99.0.0/16',
4311 'AM': '46.70.0.0/15',
4312 'AO': '105.168.0.0/13',
4313 'AP': '182.50.184.0/21',
4314 'AQ': '23.154.160.0/24',
4315 'AR': '181.0.0.0/12',
4316 'AS': '202.70.112.0/20',
4317 'AT': '77.116.0.0/14',
4318 'AU': '1.128.0.0/11',
4319 'AW': '181.41.0.0/18',
4320 'AX': '185.217.4.0/22',
4321 'AZ': '5.197.0.0/16',
4322 'BA': '31.176.128.0/17',
4323 'BB': '65.48.128.0/17',
4324 'BD': '114.130.0.0/16',
4325 'BE': '57.0.0.0/8',
4326 'BF': '102.178.0.0/15',
4327 'BG': '95.42.0.0/15',
4328 'BH': '37.131.0.0/17',
4329 'BI': '154.117.192.0/18',
4330 'BJ': '137.255.0.0/16',
4331 'BL': '185.212.72.0/23',
4332 'BM': '196.12.64.0/18',
4333 'BN': '156.31.0.0/16',
4334 'BO': '161.56.0.0/16',
4335 'BQ': '161.0.80.0/20',
4336 'BR': '191.128.0.0/12',
4337 'BS': '24.51.64.0/18',
4338 'BT': '119.2.96.0/19',
4339 'BW': '168.167.0.0/16',
4340 'BY': '178.120.0.0/13',
4341 'BZ': '179.42.192.0/18',
4342 'CA': '99.224.0.0/11',
4343 'CD': '41.243.0.0/16',
4344 'CF': '197.242.176.0/21',
4345 'CG': '160.113.0.0/16',
4346 'CH': '85.0.0.0/13',
4347 'CI': '102.136.0.0/14',
4348 'CK': '202.65.32.0/19',
4349 'CL': '152.172.0.0/14',
4350 'CM': '102.244.0.0/14',
4351 'CN': '36.128.0.0/10',
4352 'CO': '181.240.0.0/12',
4353 'CR': '201.192.0.0/12',
4354 'CU': '152.206.0.0/15',
4355 'CV': '165.90.96.0/19',
4356 'CW': '190.88.128.0/17',
4357 'CY': '31.153.0.0/16',
4358 'CZ': '88.100.0.0/14',
4359 'DE': '53.0.0.0/8',
4360 'DJ': '197.241.0.0/17',
4361 'DK': '87.48.0.0/12',
4362 'DM': '192.243.48.0/20',
4363 'DO': '152.166.0.0/15',
4364 'DZ': '41.96.0.0/12',
4365 'EC': '186.68.0.0/15',
4366 'EE': '90.190.0.0/15',
4367 'EG': '156.160.0.0/11',
4368 'ER': '196.200.96.0/20',
4369 'ES': '88.0.0.0/11',
4370 'ET': '196.188.0.0/14',
4371 'EU': '2.16.0.0/13',
4372 'FI': '91.152.0.0/13',
4373 'FJ': '144.120.0.0/16',
4374 'FK': '80.73.208.0/21',
4375 'FM': '119.252.112.0/20',
4376 'FO': '88.85.32.0/19',
4377 'FR': '90.0.0.0/9',
4378 'GA': '41.158.0.0/15',
4379 'GB': '25.0.0.0/8',
4380 'GD': '74.122.88.0/21',
4381 'GE': '31.146.0.0/16',
4382 'GF': '161.22.64.0/18',
4383 'GG': '62.68.160.0/19',
4384 'GH': '154.160.0.0/12',
4385 'GI': '95.164.0.0/16',
4386 'GL': '88.83.0.0/19',
4387 'GM': '160.182.0.0/15',
4388 'GN': '197.149.192.0/18',
4389 'GP': '104.250.0.0/19',
4390 'GQ': '105.235.224.0/20',
4391 'GR': '94.64.0.0/13',
4392 'GT': '168.234.0.0/16',
4393 'GU': '168.123.0.0/16',
4394 'GW': '197.214.80.0/20',
4395 'GY': '181.41.64.0/18',
4396 'HK': '113.252.0.0/14',
4397 'HN': '181.210.0.0/16',
4398 'HR': '93.136.0.0/13',
4399 'HT': '148.102.128.0/17',
4400 'HU': '84.0.0.0/14',
4401 'ID': '39.192.0.0/10',
4402 'IE': '87.32.0.0/12',
4403 'IL': '79.176.0.0/13',
4404 'IM': '5.62.80.0/20',
4405 'IN': '117.192.0.0/10',
4406 'IO': '203.83.48.0/21',
4407 'IQ': '37.236.0.0/14',
4408 'IR': '2.176.0.0/12',
4409 'IS': '82.221.0.0/16',
4410 'IT': '79.0.0.0/10',
4411 'JE': '87.244.64.0/18',
4412 'JM': '72.27.0.0/17',
4413 'JO': '176.29.0.0/16',
4414 'JP': '133.0.0.0/8',
4415 'KE': '105.48.0.0/12',
4416 'KG': '158.181.128.0/17',
4417 'KH': '36.37.128.0/17',
4418 'KI': '103.25.140.0/22',
4419 'KM': '197.255.224.0/20',
4420 'KN': '198.167.192.0/19',
4421 'KP': '175.45.176.0/22',
4422 'KR': '175.192.0.0/10',
4423 'KW': '37.36.0.0/14',
4424 'KY': '64.96.0.0/15',
4425 'KZ': '2.72.0.0/13',
4426 'LA': '115.84.64.0/18',
4427 'LB': '178.135.0.0/16',
4428 'LC': '24.92.144.0/20',
4429 'LI': '82.117.0.0/19',
4430 'LK': '112.134.0.0/15',
4431 'LR': '102.183.0.0/16',
4432 'LS': '129.232.0.0/17',
4433 'LT': '78.56.0.0/13',
4434 'LU': '188.42.0.0/16',
4435 'LV': '46.109.0.0/16',
4436 'LY': '41.252.0.0/14',
4437 'MA': '105.128.0.0/11',
4438 'MC': '88.209.64.0/18',
4439 'MD': '37.246.0.0/16',
4440 'ME': '178.175.0.0/17',
4441 'MF': '74.112.232.0/21',
4442 'MG': '154.126.0.0/17',
4443 'MH': '117.103.88.0/21',
4444 'MK': '77.28.0.0/15',
4445 'ML': '154.118.128.0/18',
4446 'MM': '37.111.0.0/17',
4447 'MN': '49.0.128.0/17',
4448 'MO': '60.246.0.0/16',
4449 'MP': '202.88.64.0/20',
4450 'MQ': '109.203.224.0/19',
4451 'MR': '41.188.64.0/18',
4452 'MS': '208.90.112.0/22',
4453 'MT': '46.11.0.0/16',
4454 'MU': '105.16.0.0/12',
4455 'MV': '27.114.128.0/18',
4456 'MW': '102.70.0.0/15',
4457 'MX': '187.192.0.0/11',
4458 'MY': '175.136.0.0/13',
4459 'MZ': '197.218.0.0/15',
4460 'NA': '41.182.0.0/16',
4461 'NC': '101.101.0.0/18',
4462 'NE': '197.214.0.0/18',
4463 'NF': '203.17.240.0/22',
4464 'NG': '105.112.0.0/12',
4465 'NI': '186.76.0.0/15',
4466 'NL': '145.96.0.0/11',
4467 'NO': '84.208.0.0/13',
4468 'NP': '36.252.0.0/15',
4469 'NR': '203.98.224.0/19',
4470 'NU': '49.156.48.0/22',
4471 'NZ': '49.224.0.0/14',
4472 'OM': '5.36.0.0/15',
4473 'PA': '186.72.0.0/15',
4474 'PE': '186.160.0.0/14',
4475 'PF': '123.50.64.0/18',
4476 'PG': '124.240.192.0/19',
4477 'PH': '49.144.0.0/13',
4478 'PK': '39.32.0.0/11',
4479 'PL': '83.0.0.0/11',
4480 'PM': '70.36.0.0/20',
4481 'PR': '66.50.0.0/16',
4482 'PS': '188.161.0.0/16',
4483 'PT': '85.240.0.0/13',
4484 'PW': '202.124.224.0/20',
4485 'PY': '181.120.0.0/14',
4486 'QA': '37.210.0.0/15',
4487 'RE': '102.35.0.0/16',
4488 'RO': '79.112.0.0/13',
4489 'RS': '93.86.0.0/15',
4490 'RU': '5.136.0.0/13',
4491 'RW': '41.186.0.0/16',
4492 'SA': '188.48.0.0/13',
4493 'SB': '202.1.160.0/19',
4494 'SC': '154.192.0.0/11',
4495 'SD': '102.120.0.0/13',
4496 'SE': '78.64.0.0/12',
4497 'SG': '8.128.0.0/10',
4498 'SI': '188.196.0.0/14',
4499 'SK': '78.98.0.0/15',
4500 'SL': '102.143.0.0/17',
4501 'SM': '89.186.32.0/19',
4502 'SN': '41.82.0.0/15',
4503 'SO': '154.115.192.0/18',
4504 'SR': '186.179.128.0/17',
4505 'SS': '105.235.208.0/21',
4506 'ST': '197.159.160.0/19',
4507 'SV': '168.243.0.0/16',
4508 'SX': '190.102.0.0/20',
4509 'SY': '5.0.0.0/16',
4510 'SZ': '41.84.224.0/19',
4511 'TC': '65.255.48.0/20',
4512 'TD': '154.68.128.0/19',
4513 'TG': '196.168.0.0/14',
4514 'TH': '171.96.0.0/13',
4515 'TJ': '85.9.128.0/18',
4516 'TK': '27.96.24.0/21',
4517 'TL': '180.189.160.0/20',
4518 'TM': '95.85.96.0/19',
4519 'TN': '197.0.0.0/11',
4520 'TO': '175.176.144.0/21',
4521 'TR': '78.160.0.0/11',
4522 'TT': '186.44.0.0/15',
4523 'TV': '202.2.96.0/19',
4524 'TW': '120.96.0.0/11',
4525 'TZ': '156.156.0.0/14',
4526 'UA': '37.52.0.0/14',
4527 'UG': '102.80.0.0/13',
4528 'US': '6.0.0.0/8',
4529 'UY': '167.56.0.0/13',
4530 'UZ': '84.54.64.0/18',
4531 'VA': '212.77.0.0/19',
4532 'VC': '207.191.240.0/21',
4533 'VE': '186.88.0.0/13',
4534 'VG': '66.81.192.0/20',
4535 'VI': '146.226.0.0/16',
4536 'VN': '14.160.0.0/11',
4537 'VU': '202.80.32.0/20',
4538 'WF': '117.20.32.0/21',
4539 'WS': '202.4.32.0/19',
4540 'YE': '134.35.0.0/16',
4541 'YT': '41.242.116.0/22',
4542 'ZA': '41.0.0.0/11',
4543 'ZM': '102.144.0.0/13',
4544 'ZW': '102.177.192.0/18',
4545 }
4546
4547 @classmethod
4548 def random_ipv4(cls, code_or_block):
4549 if len(code_or_block) == 2:
4550 block = cls._country_ip_map.get(code_or_block.upper())
4551 if not block:
4552 return None
4553 else:
4554 block = code_or_block
4555 addr, preflen = block.split('/')
4556 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4557 addr_max = addr_min | (0xffffffff >> int(preflen))
4558 return compat_str(socket.inet_ntoa(
4559 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4560
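# Illustration: GeoUtils.random_ipv4('DE') returns a random address inside
# 53.0.0.0/8 (e.g. '53.17.42.9'); a CIDR block may also be passed directly,
# e.g. GeoUtils.random_ipv4('10.0.0.0/8')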
4561
4562 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4563 def __init__(self, proxies=None):
4564 # Set default handlers
4565 for type in ('http', 'https'):
4566 setattr(self, '%s_open' % type,
4567 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4568 meth(r, proxy, type))
4569 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4570
4571 def proxy_open(self, req, proxy, type):
4572 req_proxy = req.headers.get('Ytdl-request-proxy')
4573 if req_proxy is not None:
4574 proxy = req_proxy
4575 del req.headers['Ytdl-request-proxy']
4576
4577 if proxy == '__noproxy__':
4578 return None # No Proxy
4579 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4580 req.add_header('Ytdl-socks-proxy', proxy)
4581 # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
4582 return None
4583 return compat_urllib_request.ProxyHandler.proxy_open(
4584 self, req, proxy, type)
4585
4586
4587 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4588 # released into Public Domain
4589 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4590
4591 def long_to_bytes(n, blocksize=0):
4592 """long_to_bytes(n:long, blocksize:int) : string
4593 Convert a long integer to a byte string.
4594
4595 If optional blocksize is given and greater than zero, pad the front of the
4596 byte string with binary zeros so that the length is a multiple of
4597 blocksize.
4598 """
4599 # after much testing, this algorithm was deemed to be the fastest
4600 s = b''
4601 n = int(n)
4602 while n > 0:
4603 s = compat_struct_pack('>I', n & 0xffffffff) + s
4604 n = n >> 32
4605 # strip off leading zeros
4606 for i in range(len(s)):
4607 if s[i] != b'\000'[0]:
4608 break
4609 else:
4610 # only happens when n == 0
4611 s = b'\000'
4612 i = 0
4613 s = s[i:]
4614 # add back some pad bytes. this could be done more efficiently w.r.t. the
4615 # de-padding being done above, but sigh...
4616 if blocksize > 0 and len(s) % blocksize:
4617 s = (blocksize - len(s) % blocksize) * b'\000' + s
4618 return s
4619
4620
4621 def bytes_to_long(s):
4622 """bytes_to_long(string) : long
4623 Convert a byte string to a long integer.
4624
4625 This is (essentially) the inverse of long_to_bytes().
4626 """
4627 acc = 0
4628 length = len(s)
4629 if length % 4:
4630 extra = (4 - length % 4)
4631 s = b'\000' * extra + s
4632 length = length + extra
4633 for i in range(0, length, 4):
4634 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4635 return acc
4636
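# Round-trip example:
#   long_to_bytes(65537)           -> b'\x01\x00\x01'
#   bytes_to_long(b'\x01\x00\x01') -> 65537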
4637
4638 def ohdave_rsa_encrypt(data, exponent, modulus):
4639 '''
4640 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4641
4642 Input:
4643 data: data to encrypt, bytes-like object
4644 exponent, modulus: parameter e and N of RSA algorithm, both integer
4645 Output: hex string of encrypted data
4646
4647 Limitation: supports one block encryption only
4648 '''
4649
4650 payload = int(binascii.hexlify(data[::-1]), 16)
4651 encrypted = pow(payload, exponent, modulus)
4652 return '%x' % encrypted
4653
4654
4655 def pkcs1pad(data, length):
4656 """
4657 Padding input data with PKCS#1 scheme
4658
4659 @param {int[]} data input data
4660 @param {int} length target length
4661 @returns {int[]} padded data
4662 """
4663 if len(data) > length - 11:
4664 raise ValueError('Input data too long for PKCS#1 padding')
4665
4666 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding octets must be nonzero per PKCS#1 v1.5
4667 return [0, 2] + pseudo_random + [0] + data
4668
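# e.g. pkcs1pad([0x41], 16) -> [0, 2, <12 random nonzero octets>, 0, 0x41]
# (16 octets total, following the PKCS#1 v1.5 block format 0x00 0x02 PS 0x00 D)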
4669
4670 def encode_base_n(num, n, table=None):
4671 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4672 if not table:
4673 table = FULL_TABLE[:n]
4674
4675 if n > len(table):
4676 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4677
4678 if num == 0:
4679 return table[0]
4680
4681 ret = ''
4682 while num:
4683 ret = table[num % n] + ret
4684 num = num // n
4685 return ret
4686
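# e.g. encode_base_n(255, 16) -> 'ff'; encode_base_n(10, 2) -> '1010'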
4687
4688 def decode_packed_codes(code):
4689 mobj = re.search(PACKED_CODES_RE, code)
4690 obfuscated_code, base, count, symbols = mobj.groups()
4691 base = int(base)
4692 count = int(count)
4693 symbols = symbols.split('|')
4694 symbol_table = {}
4695
4696 while count:
4697 count -= 1
4698 base_n_count = encode_base_n(count, base)
4699 symbol_table[base_n_count] = symbols[count] or base_n_count
4700
4701 return re.sub(
4702 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4703 obfuscated_code)
4704
4705
4706 def caesar(s, alphabet, shift):
4707 if shift == 0:
4708 return s
4709 l = len(alphabet)
4710 return ''.join(
4711 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4712 for c in s)
4713
4714
4715 def rot47(s):
4716 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4717
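# rot47 is its own inverse over the printable ASCII range:
#   rot47('test') -> 'E6DE'; rot47('E6DE') -> 'test'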
4718
4719 def parse_m3u8_attributes(attrib):
4720 info = {}
4721 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4722 if val.startswith('"'):
4723 val = val[1:-1]
4724 info[key] = val
4725 return info
4726
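# e.g. parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
# -> {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}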
4727
4728 def urshift(val, n):
4729 return val >> n if val >= 0 else (val + 0x100000000) >> n
4730
4731
4732 # Based on png2str() written by @gdkchan and improved by @yokrysty
4733 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4734 def decode_png(png_data):
4735 # Reference: https://www.w3.org/TR/PNG/
4736 header = png_data[8:]
4737
4738 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4739 raise IOError('Not a valid PNG file.')
4740
4741 int_map = {1: '>B', 2: '>H', 4: '>I'}
4742 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4743
4744 chunks = []
4745
4746 while header:
4747 length = unpack_integer(header[:4])
4748 header = header[4:]
4749
4750 chunk_type = header[:4]
4751 header = header[4:]
4752
4753 chunk_data = header[:length]
4754 header = header[length:]
4755
4756 header = header[4:] # Skip CRC
4757
4758 chunks.append({
4759 'type': chunk_type,
4760 'length': length,
4761 'data': chunk_data
4762 })
4763
4764 ihdr = chunks[0]['data']
4765
4766 width = unpack_integer(ihdr[:4])
4767 height = unpack_integer(ihdr[4:8])
4768
4769 idat = b''
4770
4771 for chunk in chunks:
4772 if chunk['type'] == b'IDAT':
4773 idat += chunk['data']
4774
4775 if not idat:
4776 raise IOError('Unable to read PNG data.')
4777
4778 decompressed_data = bytearray(zlib.decompress(idat))
4779
4780 stride = width * 3
4781 pixels = []
4782
4783 def _get_pixel(idx):
4784 x = idx % stride
4785 y = idx // stride
4786 return pixels[y][x]
4787
4788 for y in range(height):
4789 basePos = y * (1 + stride)
4790 filter_type = decompressed_data[basePos]
4791
4792 current_row = []
4793
4794 pixels.append(current_row)
4795
4796 for x in range(stride):
4797 color = decompressed_data[1 + basePos + x]
4798 basex = y * stride + x
4799 left = 0
4800 up = 0
4801
4802 if x > 2:
4803 left = _get_pixel(basex - 3)
4804 if y > 0:
4805 up = _get_pixel(basex - stride)
4806
4807 if filter_type == 1: # Sub
4808 color = (color + left) & 0xff
4809 elif filter_type == 2: # Up
4810 color = (color + up) & 0xff
4811 elif filter_type == 3: # Average
4812 color = (color + ((left + up) >> 1)) & 0xff
4813 elif filter_type == 4: # Paeth
4814 a = left
4815 b = up
4816 c = 0
4817
4818 if x > 2 and y > 0:
4819 c = _get_pixel(basex - stride - 3)
4820
4821 p = a + b - c
4822
4823 pa = abs(p - a)
4824 pb = abs(p - b)
4825 pc = abs(p - c)
4826
4827 if pa <= pb and pa <= pc:
4828 color = (color + a) & 0xff
4829 elif pb <= pc:
4830 color = (color + b) & 0xff
4831 else:
4832 color = (color + c) & 0xff
4833
4834 current_row.append(color)
4835
4836 return width, height, pixels
4837
4838
4839 def write_xattr(path, key, value):
4840 # This mess below finds the best xattr tool for the job
4841 try:
4842 # try the pyxattr module...
4843 import xattr
4844
4845 if hasattr(xattr, 'set'): # pyxattr
4846 # Unicode arguments are not supported in python-pyxattr until
4847 # version 0.5.0
4848 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4849 pyxattr_required_version = '0.5.0'
4850 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4851 # TODO: fallback to CLI tools
4852 raise XAttrUnavailableError(
4853 'python-pyxattr is detected but is too old. '
4854 'yt-dlp requires %s or above while your version is %s. '
4855 'Falling back to other xattr implementations' % (
4856 pyxattr_required_version, xattr.__version__))
4857
4858 setxattr = xattr.set
4859 else: # xattr
4860 setxattr = xattr.setxattr
4861
4862 try:
4863 setxattr(path, key, value)
4864 except EnvironmentError as e:
4865 raise XAttrMetadataError(e.errno, e.strerror)
4866
4867 except ImportError:
4868 if compat_os_name == 'nt':
4869 # Write xattrs to NTFS Alternate Data Streams:
4870 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4871 assert ':' not in key
4872 assert os.path.exists(path)
4873
4874 ads_fn = path + ':' + key
4875 try:
4876 with open(ads_fn, 'wb') as f:
4877 f.write(value)
4878 except EnvironmentError as e:
4879 raise XAttrMetadataError(e.errno, e.strerror)
4880 else:
4881 user_has_setfattr = check_executable('setfattr', ['--version'])
4882 user_has_xattr = check_executable('xattr', ['-h'])
4883
4884 if user_has_setfattr or user_has_xattr:
4885
4886 value = value.decode('utf-8')
4887 if user_has_setfattr:
4888 executable = 'setfattr'
4889 opts = ['-n', key, '-v', value]
4890 elif user_has_xattr:
4891 executable = 'xattr'
4892 opts = ['-w', key, value]
4893
4894 cmd = ([encodeFilename(executable, True)]
4895 + [encodeArgument(o) for o in opts]
4896 + [encodeFilename(path, True)])
4897
4898 try:
4899 p = Popen(
4900 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4901 except EnvironmentError as e:
4902 raise XAttrMetadataError(e.errno, e.strerror)
4903 stdout, stderr = p.communicate_or_kill()
4904 stderr = stderr.decode('utf-8', 'replace')
4905 if p.returncode != 0:
4906 raise XAttrMetadataError(p.returncode, stderr)
4907
4908 else:
4909 # On Unix, but we can't find pyxattr, setfattr, or xattr.
4910 if sys.platform.startswith('linux'):
4911 raise XAttrUnavailableError(
4912 "Couldn't find a tool to set the xattrs. "
4913 "Install either the python 'pyxattr' or 'xattr' "
4914 "modules, or the GNU 'attr' package "
4915 "(which contains the 'setfattr' tool).")
4916 else:
4917 raise XAttrUnavailableError(
4918 "Couldn't find a tool to set the xattrs. "
4919 "Install either the python 'xattr' module, "
4920 "or the 'xattr' binary.")
4921
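# Typical call (illustrative; the value must be bytes):
#   write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/')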
4922
4923 def random_birthday(year_field, month_field, day_field):
4924 start_date = datetime.date(1950, 1, 1)
4925 end_date = datetime.date(1995, 12, 31)
4926 offset = random.randint(0, (end_date - start_date).days)
4927 random_date = start_date + datetime.timedelta(offset)
4928 return {
4929 year_field: str(random_date.year),
4930 month_field: str(random_date.month),
4931 day_field: str(random_date.day),
4932 }
4933
4934
4935 # Templates for internet shortcut files, which are plain text files.
4936 DOT_URL_LINK_TEMPLATE = '''
4937 [InternetShortcut]
4938 URL=%(url)s
4939 '''.lstrip()
4940
4941 DOT_WEBLOC_LINK_TEMPLATE = '''
4942 <?xml version="1.0" encoding="UTF-8"?>
4943 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4944 <plist version="1.0">
4945 <dict>
4946 \t<key>URL</key>
4947 \t<string>%(url)s</string>
4948 </dict>
4949 </plist>
4950 '''.lstrip()
4951
4952 DOT_DESKTOP_LINK_TEMPLATE = '''
4953 [Desktop Entry]
4954 Encoding=UTF-8
4955 Name=%(filename)s
4956 Type=Link
4957 URL=%(url)s
4958 Icon=text-html
4959 '''.lstrip()
4960
4961 LINK_TEMPLATES = {
4962 'url': DOT_URL_LINK_TEMPLATE,
4963 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4964 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4965 }
4966
4967
4968 def iri_to_uri(iri):
4969 """
4970 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4971
4972 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4973 """
4974
4975 iri_parts = compat_urllib_parse_urlparse(iri)
4976
4977 if '[' in iri_parts.netloc:
4978 raise ValueError('IPv6 URIs are not yet supported.')
4979 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4980
4981 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4982
4983 net_location = ''
4984 if iri_parts.username:
4985 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4986 if iri_parts.password is not None:
4987 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4988 net_location += '@'
4989
4990 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4991 # The 'idna' encoding produces ASCII text.
4992 if iri_parts.port is not None and iri_parts.port != 80:
4993 net_location += ':' + str(iri_parts.port)
4994
4995 return compat_urllib_parse_urlunparse(
4996 (iri_parts.scheme,
4997 net_location,
4998
4999 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5000
5001 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5002 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5003
5004 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5005 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5006
5007 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5008
5009 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5010
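# Illustration (sketch; the host is punycoded, the path percent-encoded as UTF-8):
#   iri_to_uri('http://müller.example/séance')
#   -> 'http://xn--mller-kva.example/s%C3%A9ance'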
5011
5012 def to_high_limit_path(path):
5013 if sys.platform in ['win32', 'cygwin']:
5014 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5015 return r'\\?\ '.rstrip() + os.path.abspath(path)
5016
5017 return path
5018
5019
5020 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
5021 val = traverse_obj(obj, *variadic(field))
5022 if val in ignore:
5023 return default
5024 return template % (func(val) if func else val)
5025
5026
5027 def clean_podcast_url(url):
5028 return re.sub(r'''(?x)
5029 (?:
5030 (?:
5031 chtbl\.com/track|
5032 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5033 play\.podtrac\.com
5034 )/[^/]+|
5035 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5036 flex\.acast\.com|
5037 pd(?:
5038 cn\.co| # https://podcorn.com/analytics-prefix/
5039 st\.fm # https://podsights.com/docs/
5040 )/e
5041 )/''', '', url)
5042
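# e.g. clean_podcast_url('https://chtbl.com/track/AB123/traffic.example.com/ep.mp3')
# -> 'https://traffic.example.com/ep.mp3'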
5043
5044 _HEX_TABLE = '0123456789abcdef'
5045
5046
5047 def random_uuidv4():
5048 return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(0, 15) if m.group(0) == 'x' else random.randint(8, 11)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # the 'y' nibble carries the RFC 4122 variant bits (8-b)
5049
5050
5051 def make_dir(path, to_screen=None):
5052 try:
5053 dn = os.path.dirname(path)
5054 if dn and not os.path.exists(dn):
5055 os.makedirs(dn)
5056 return True
5057 except (OSError, IOError) as err:
5058 if callable(to_screen):
5059 to_screen('unable to create directory ' + error_to_compat_str(err))
5060 return False
5061
5062
5063 def get_executable_path():
5064 from zipimport import zipimporter
5065 if hasattr(sys, 'frozen'): # Running from PyInstaller
5066 path = os.path.dirname(sys.executable)
5067 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5068 path = os.path.join(os.path.dirname(__file__), '../..')
5069 else:
5070 path = os.path.join(os.path.dirname(__file__), '..')
5071 return os.path.abspath(path)
5072
5073
5074 def load_plugins(name, suffix, namespace):
5075 classes = {}
5076 try:
5077 plugins_spec = importlib.util.spec_from_file_location(
5078 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5079 plugins = importlib.util.module_from_spec(plugins_spec)
5080 sys.modules[plugins_spec.name] = plugins
5081 plugins_spec.loader.exec_module(plugins)
5082 for name in dir(plugins):
5083 if name in namespace:
5084 continue
5085 if not name.endswith(suffix):
5086 continue
5087 klass = getattr(plugins, name)
5088 classes[name] = namespace[name] = klass
5089 except FileNotFoundError:
5090 pass
5091 return classes
5092
5093
5094 def traverse_obj(
5095 obj, *path_list, default=None, expected_type=None, get_all=True,
5096 casesense=True, is_user_input=False, traverse_string=False):
5097 ''' Traverse nested list/dict/tuple
5098 @param path_list A list of paths which are checked one by one.
5099 Each path is a list of keys where each key is a string,
5100 a function, a tuple of strings/None or "...".
5101 When a function is given, it takes the key as argument and
5102 returns whether the key matches or not. When a tuple is given,
5103 all the keys given in the tuple are traversed, and
5104 "..." traverses all the keys in the object
5105 "None" returns the object without traversal
5106 @param default Default value to return
5107 @param expected_type Only accept final value of this type (Can also be any callable)
5108 @param get_all Return all the values obtained from a path or only the first one
5109 @param casesense Whether to consider dictionary keys as case sensitive
5110 @param is_user_input Whether the keys are generated from user input. If True,
5111 strings are converted to int/slice if necessary
5112 @param traverse_string Whether to traverse inside strings. If True, any
5113 non-compatible object will also be converted into a string
5114 # TODO: Write tests
5115 '''
5116 if not casesense:
5117 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5118 path_list = (map(_lower, variadic(path)) for path in path_list)
5119
5120 def _traverse_obj(obj, path, _current_depth=0):
5121 nonlocal depth
5122 path = tuple(variadic(path))
5123 for i, key in enumerate(path):
5124 if None in (key, obj):
5125 return obj
5126 if isinstance(key, (list, tuple)):
5127 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5128 key = ...
5129 if key is ...:
5130 obj = (obj.values() if isinstance(obj, dict)
5131 else obj if isinstance(obj, (list, tuple, LazyList))
5132 else str(obj) if traverse_string else [])
5133 _current_depth += 1
5134 depth = max(depth, _current_depth)
5135 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5136 elif callable(key):
5137 if isinstance(obj, (list, tuple, LazyList)):
5138 obj = enumerate(obj)
5139 elif isinstance(obj, dict):
5140 obj = obj.items()
5141 else:
5142 if not traverse_string:
5143 return None
5144 obj = str(obj)
5145 _current_depth += 1
5146 depth = max(depth, _current_depth)
5147 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5148 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5149 obj = (obj.get(key) if casesense or (key in obj)
5150 else next((v for k, v in obj.items() if _lower(k) == key), None))
5151 else:
5152 if is_user_input:
5153 key = (int_or_none(key) if ':' not in key
5154 else slice(*map(int_or_none, key.split(':'))))
5155 if key == slice(None):
5156 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5157 if not isinstance(key, (int, slice)):
5158 return None
5159 if not isinstance(obj, (list, tuple, LazyList)):
5160 if not traverse_string:
5161 return None
5162 obj = str(obj)
5163 try:
5164 obj = obj[key]
5165 except IndexError:
5166 return None
5167 return obj
5168
5169 if isinstance(expected_type, type):
5170 type_test = lambda val: val if isinstance(val, expected_type) else None
5171 elif expected_type is not None:
5172 type_test = expected_type
5173 else:
5174 type_test = lambda val: val
5175
5176 for path in path_list:
5177 depth = 0
5178 val = _traverse_obj(obj, path)
5179 if val is not None:
5180 if depth:
5181 for _ in range(depth - 1):
5182 val = itertools.chain.from_iterable(v for v in val if v is not None)
5183 val = [v for v in map(type_test, val) if v is not None]
5184 if val:
5185 return val if get_all else val[0]
5186 else:
5187 val = type_test(val)
5188 if val is not None:
5189 return val
5190 return default
5191
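# A couple of illustrative traversals:
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))  -> [1, 2]
#   traverse_obj({'a': {'B': 3}}, ('a', 'b'), casesense=False)  -> 3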
5192
5193 def traverse_dict(dictn, keys, casesense=True):
5194 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5195 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5196 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5197
5198
5199 def variadic(x, allowed_types=(str, bytes, dict)):
5200 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5201
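# e.g. variadic('abc') -> ('abc',); variadic([1, 2]) -> [1, 2]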
5202
5203 def decode_base(value, digits):
5204 # Convert the given base-x string to an integer
5205 table = {char: index for index, char in enumerate(digits)}
5206 result = 0
5207 base = len(digits)
5208 for char in value:  # 'char' rather than shadowing the built-in chr()
5209 result *= base
5210 result += table[char]
5211 return result
5212
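# e.g. decode_base('ff', '0123456789abcdef') -> 255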
5213
5214 def time_seconds(**kwargs):
5215 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5216 return t.timestamp()
5217
5218
5219 # create a JSON Web Signature (jws) with HS256 algorithm
5220 # the resulting format is in JWS Compact Serialization
5221 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5222 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5223 def jwt_encode_hs256(payload_data, key, headers={}):
5224 header_data = {
5225 'alg': 'HS256',
5226 'typ': 'JWT',
5227 }
5228 if headers:
5229 header_data.update(headers)
5230 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5231 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5232 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5233 signature_b64 = base64.b64encode(h.digest())
5234 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5235 return token
5236
5237
5238 # can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5239 def jwt_decode_hs256(jwt):
5240 header_b64, payload_b64, signature_b64 = jwt.split('.')
5241 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5242 return payload_data
5243
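# Round trip (illustrative; jwt_decode_hs256 does not verify the signature):
#   token = jwt_encode_hs256({'uid': 123}, 'secret')
#   jwt_decode_hs256(token.decode('utf-8')) -> {'uid': 123}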
5244
5245 def supports_terminal_sequences(stream):
5246 if compat_os_name == 'nt':
5247 from .compat import WINDOWS_VT_MODE # Must be imported locally
5248 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5249 return False
5250 elif not os.getenv('TERM'):
5251 return False
5252 try:
5253 return stream.isatty()
5254 except BaseException:
5255 return False
5256
5257
5258 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5259
5260
5261 def remove_terminal_sequences(string):
5262 return _terminal_sequences_re.sub('', string)
5263
5264
5265 def number_of_digits(number):
5266 return len('%d' % number)
5267
5268
5269 def join_nonempty(*values, delim='-', from_dict=None):
5270 if from_dict is not None:
5271 values = map(from_dict.get, values)
5272 return delim.join(map(str, filter(None, values)))
5273
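# e.g. join_nonempty('mp4', None, 720) -> 'mp4-720'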
5274
5275 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5276 """
5277 Find the largest format dimensions in terms of video width and, for each thumbnail:
5278 * Modify the URL: Match the width with the provided regex and replace with the former width
5279 * Update dimensions
5280
5281 This function is useful with video services that scale the provided thumbnails on demand
5282 """
5283 _keys = ('width', 'height')
5284 max_dimensions = max(
5285 [tuple(format.get(k) or 0 for k in _keys) for format in formats],
5286 default=(0, 0))
5287 if not max_dimensions[0]:
5288 return thumbnails
5289 return [
5290 merge_dicts(
5291 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5292 dict(zip(_keys, max_dimensions)), thumbnail)
5293 for thumbnail in thumbnails
5294 ]
5295
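# Sketch with hypothetical data: given formats [{'width': 1280, 'height': 720}],
# a thumbnail {'url': 'https://example.com/320/thumb.jpg'} and url_width_re=r'320',
# the thumbnail becomes
# {'url': 'https://example.com/1280/thumb.jpg', 'width': 1280, 'height': 720}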
5296
5297 def parse_http_range(range):
5298 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5299 if not range:
5300 return None, None, None
5301 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5302 if not crg:
5303 return None, None, None
5304 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5305
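# e.g. parse_http_range('bytes 0-499/1234') -> (0, 499, 1234)
#      parse_http_range('bytes=500-')       -> (500, None, None)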
5306
5307 class Config:
5308 own_args = None
5309 filename = None
5310 __initialized = False
5311
5312 def __init__(self, parser, label=None):
5313 self._parser, self.label = parser, label
5314 self._loaded_paths, self.configs = set(), []
5315
5316 def init(self, args=None, filename=None):
5317 assert not self.__initialized
5318 directory = ''
5319 if filename:
5320 location = os.path.realpath(filename)
5321 directory = os.path.dirname(location)
5322 if location in self._loaded_paths:
5323 return False
5324 self._loaded_paths.add(location)
5325
5326 self.__initialized = True
5327 self.own_args, self.filename = args, filename
5328 for location in self._parser.parse_args(args)[0].config_locations or []:
5329 location = os.path.join(directory, expand_path(location))
5330 if os.path.isdir(location):
5331 location = os.path.join(location, 'yt-dlp.conf')
5332 if not os.path.exists(location):
5333 self._parser.error(f'config location {location} does not exist')
5334 self.append_config(self.read_file(location), location)
5335 return True
5336
5337 def __str__(self):
5338 label = join_nonempty(
5339 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5340 delim=' ')
5341 return join_nonempty(
5342 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5343 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5344 delim='\n')
5345
5346 @staticmethod
5347 def read_file(filename, default=[]):
5348 try:
5349 optionf = open(filename)
5350 except IOError:
5351 return default # silently skip if file is not present
5352 try:
5353 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5354 contents = optionf.read()
5355 if sys.version_info < (3,):
5356 contents = contents.decode(preferredencoding())
5357 res = compat_shlex_split(contents, comments=True)
5358 finally:
5359 optionf.close()
5360 return res
5361
5362 @staticmethod
5363 def hide_login_info(opts):
5364 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5365 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5366
5367 def _scrub_eq(o):
5368 m = eqre.match(o)
5369 if m:
5370 return m.group('key') + '=PRIVATE'
5371 else:
5372 return o
5373
5374 opts = list(map(_scrub_eq, opts))
5375 for idx, opt in enumerate(opts):
5376 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5377 opts[idx + 1] = 'PRIVATE'
5378 return opts
5379
5380 def append_config(self, *args, label=None):
5381 config = type(self)(self._parser, label)
5382 config._loaded_paths = self._loaded_paths
5383 if config.init(*args):
5384 self.configs.append(config)
5385
5386 @property
5387 def all_args(self):
5388 for config in reversed(self.configs):
5389 yield from config.all_args
5390 yield from self.own_args or []
5391
5392 def parse_args(self):
5393 return self._parser.parse_args(list(self.all_args))
5394
5395
5396 class WebSocketsWrapper():
5397 """Wraps websockets module to use in non-async scopes"""
5398
5399 def __init__(self, url, headers=None):
5400 self.loop = asyncio.events.new_event_loop()
5401 self.conn = compat_websockets.connect(
5402 url, extra_headers=headers, ping_interval=None,
5403 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5404 atexit.register(self.__exit__, None, None, None)
5405
5406 def __enter__(self):
5407 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5408 return self
5409
5410 def send(self, *args):
5411 self.run_with_loop(self.pool.send(*args), self.loop)
5412
5413 def recv(self, *args):
5414 return self.run_with_loop(self.pool.recv(*args), self.loop)
5415
5416 def __exit__(self, type, value, traceback):
5417 try:
5418 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5419 finally:
5420 self._cancel_all_tasks(self.loop)  # cancel pending tasks before the loop is closed
5421 self.loop.close()
5422
5423 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5424 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5425 @staticmethod
5426 def run_with_loop(main, loop):
5427 if not asyncio.coroutines.iscoroutine(main):
5428 raise ValueError(f'a coroutine was expected, got {main!r}')
5429
5430 try:
5431 return loop.run_until_complete(main)
5432 finally:
5433 loop.run_until_complete(loop.shutdown_asyncgens())
5434 if hasattr(loop, 'shutdown_default_executor'):
5435 loop.run_until_complete(loop.shutdown_default_executor())
5436
5437 @staticmethod
5438 def _cancel_all_tasks(loop):
5439 to_cancel = asyncio.tasks.all_tasks(loop)
5440
5441 if not to_cancel:
5442 return
5443
5444 for task in to_cancel:
5445 task.cancel()
5446
5447 loop.run_until_complete(
5448 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5449
5450 for task in to_cancel:
5451 if task.cancelled():
5452 continue
5453 if task.exception() is not None:
5454 loop.call_exception_handler({
5455 'message': 'unhandled exception during asyncio.run() shutdown',
5456 'exception': task.exception(),
5457 'task': task,
5458 })
5459
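# Non-async usage sketch (assuming the websockets package is available):
#   with WebSocketsWrapper('wss://example.com/socket') as ws:
#       ws.send('hello')
#       reply = ws.recv()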
5460
5461 has_websockets = bool(compat_websockets)
5462
5463
5464 def merge_headers(*dicts):
5465 """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
5466 return {k.capitalize(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
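
# e.g. merge_headers({'User-Agent': 'UA', 'X-Foo': '1'}, {'user-agent': 'yt-dlp'})
# -> {'User-agent': 'yt-dlp', 'X-foo': '1'} (keys are normalized via str.capitalize)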