"""
    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import sys
import time

from pip._vendor.pygments.filter import apply_filters, Filter
from pip._vendor.pygments.filters import get_filter_by_name
from pip._vendor.pygments.token import Error, Text, Other, Whitespace, _TokenType
from pip._vendor.pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
    make_analysator, Future, guess_decode
from pip._vendor.pygments.regexopt import regex_opt

__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
           'default', 'words', 'line_re']

line_re = re.compile('.*?\n')

_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    See also :doc:`lexerdevelopment`, a high-level guide to writing
    lexers.

    Lexer classes have attributes used for choosing the most appropriate
    lexer based on various criteria.

    .. autoattribute:: name
       :no-value:
    .. autoattribute:: aliases
       :no-value:
    .. autoattribute:: filenames
       :no-value:
    .. autoattribute:: alias_filenames
    .. autoattribute:: mimetypes
       :no-value:
    .. autoattribute:: priority

    Lexers included in Pygments should have an additional attribute:

    .. autoattribute:: url
       :no-value:

    You can pass options to the constructor. The basic options recognized
    by all lexers and processed by the base `Lexer` class are:

    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Full name of the lexer, in human-readable form
    name = None

    #: A list of short, unique identifiers that can be used to look
    #: up the lexer from a list, e.g., using `get_lexer_by_name()`.
    aliases = []

    #: A list of `fnmatch` patterns that match filenames which contain
    #: content for this lexer. The patterns in this list should be unique among
    #: all lexers.
    filenames = []

    #: A list of `fnmatch` patterns that match filenames which may or may not
    #: contain content for this lexer. This list is used by the
    #: :func:`.guess_lexer_for_filename()` function, to determine which lexers
    #: are then included in guessing the correct one. That means that
    #: e.g. every lexer for HTML and a template language should include
    #: ``\*.html`` in this list.
    alias_filenames = []

    #: A list of MIME types for content that can be lexed with this lexer.
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    #: URL of the language specification/definition. Used in the Pygments
    #: documentation.
    url = None

    def __init__(self, **options):
        """
        This constructor takes arbitrary options as keyword arguments.
        Every subclass must first process its own options and then call
        the `Lexer` constructor, since it processes the basic
        options like `stripnl`.

        An example looks like this:

        .. sourcecode:: python

           def __init__(self, **options):
               self.compress = options.get('compress', '')
               Lexer.__init__(self, **options)

        As these options must all be specifiable as strings (due to the
        command line usage), there are various utility functions
        available to help with that, see `Utilities`_.
        """
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        A static method which is called for lexer guessing.

        It should analyse the text and return a float in the range
        from ``0.0`` to ``1.0``. If it returns ``0.0``, the lexer
        will not be selected as the most probable one, if it returns
        ``1.0``, it will be selected immediately. This is used by
        `guess_lexer`.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`,
        it's the same as if the return value was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        This method is the basic interface of a lexer. It is called by
        the `highlight()` function. It must process the text and return an
        iterable of ``(tokentype, value)`` pairs from `text`.

        Normally, you don't need to override this method. The default
        implementation processes the options recognized by all lexers
        (`stripnl`, `stripall` and so on), and then yields all tokens
        from `get_tokens_unprocessed()`, with the ``index`` dropped.

        If `unfiltered` is set to `True`, the filtering mechanism is
        bypassed even if filters are defined.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    from pip._vendor import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        This method should process the text and return an iterable of
        ``(index, tokentype, value)`` tuples where ``index`` is the starting
        position of the token within the input text.

        It must be overridden by subclasses. It is recommended to
        implement it as a generator to maximize effectiveness.
        """
        raise NotImplementedError

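
# --- Editorial example (not part of the original module) --------------------
# A minimal sketch of the `Lexer` contract described above: a subclass only
# has to implement get_tokens_unprocessed(), yielding (index, tokentype,
# value) tuples; get_tokens() then drops the index and applies options such
# as `stripnl` and `ensurenl`. The class name is made up for illustration.

class _ExampleWholeTextLexer(Lexer):
    """Emits the whole input as a single ``Text`` token."""
    name = 'Example whole-text lexer'
    aliases = ['example-wholetext']

    def get_tokens_unprocessed(self, text):
        # One token, starting at index 0, covering the entire input.
        yield 0, Text, text

# Usage sketch (kept as a comment so importing this module stays
# side-effect free):
#
#   for ttype, value in _ExampleWholeTextLexer().get_tokens('  hi  \n'):
#       print(ttype, repr(value))    # -> Token.Text '  hi  \n'
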
class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments: a root lexer and
    a language lexer. First everything is scanned using the language
    lexer, then all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))

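
# --- Editorial example (not part of the original module) --------------------
# A sketch of how a template-style lexer is typically built on
# DelegatingLexer: the constructor is given the two lexer *classes*, the
# language lexer runs first, and everything it reports as ``Other`` is
# re-lexed by the root lexer. ``SomeMarkupLexer`` and ``SomeTemplateLexer``
# are placeholder names, not real Pygments classes.
#
#   class MyTemplateLexer(DelegatingLexer):
#       name = 'My template'
#       aliases = ['mytemplate']
#
#       def __init__(self, **options):
#           super().__init__(SomeMarkupLexer, SomeTemplateLexer, **options)
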
# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback

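
# --- Editorial example (not part of the original module) --------------------
# bygroups() is used inside a RegexLexer ``tokens`` table when one regex
# should emit several tokens, one per capture group. A sketch of such a
# rule (Name and Operator would come from pip._vendor.pygments.token):
#
#   (r'(\w+)(\s*)(=)(\s*)', bygroups(Name.Attribute, Whitespace, Operator,
#                                    Whitespace)),
#
# Groups that match the empty string are skipped for plain token types, and
# a ``None`` entry discards the corresponding group entirely.
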
class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback

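
# --- Editorial example (not part of the original module) --------------------
# Two sketches of ``using`` inside a ``tokens`` rule. The first re-lexes the
# matched text with the current lexer itself (``this``), starting in the
# given state; the second delegates to another lexer class. Both the state
# name 'interp' and the class name ``SomeOtherLexer`` are placeholders.
#
#   (r'<<(.*?)>>', using(this, state='interp')),
#   (r'`[^`]*`', using(SomeOtherLexer)),
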
class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example, default('#pop') is equivalent to ('', Token, '#pop').
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)

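
# --- Editorial example (not part of the original module) --------------------
# ``words()`` is lazily expanded (via ``Future.get()``) into one optimized
# alternation when the token definitions are compiled, so long keyword lists
# stay readable. A sketch of a rule using it (Keyword would come from
# pip._vendor.pygments.token):
#
#   (words(('if', 'elif', 'else', 'while', 'for'), suffix=r'\b'), Keyword),
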
class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not be in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: that the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break

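
# --- Editorial example (not part of the original module) --------------------
# A small RegexLexer subclass tying the helpers above together: a ``tokens``
# table with several states, ``include()``, ``words()``, ``bygroups()`` and a
# ``default()`` fallback. It lexes a made-up ``key = value`` configuration
# syntax and exists purely as an illustration; the extra token types are
# imported from the vendored token module.
from pip._vendor.pygments.token import Comment, Keyword, Name, Operator, String


class _ExampleConfLexer(RegexLexer):
    name = 'Example conf'
    aliases = ['example-conf']

    tokens = {
        'whitespace': [
            (r'\s+', Whitespace),
            (r'#.*?$', Comment.Single),
        ],
        'root': [
            include('whitespace'),
            # one regex, three token types, then switch to the 'value' state
            (r'(\w+)(\s*)(=)', bygroups(Name.Attribute, Whitespace, Operator),
             'value'),
            # bare literal words, combined into one optimized regex
            (words(('true', 'false', 'null'), suffix=r'\b'), Keyword.Constant),
        ],
        'value': [
            (r'[ \t]+', Whitespace),
            (r'"[^"]*"', String.Double, '#pop'),
            (r'[^\s#]+', Name.Constant, '#pop'),
            default('#pop'),  # empty value: just return to 'root'
        ],
    }

# Usage sketch (kept as a comment so importing this module stays
# side-effect free):
#
#   for pos, ttype, value in _ExampleConfLexer().get_tokens_unprocessed(
#           'answer = 42  # the usual\n'):
#       print(pos, ttype, repr(value))
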
class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                            # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break

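
# --- Editorial example (not part of the original module) --------------------
# A sketch of the context-driven API above: an ExtendedRegexLexer can be
# handed an explicit LexerContext, and after lexing the context's ``pos`` and
# ``stack`` describe where it stopped, which callbacks (and callers) may use
# to resume or to lex a sub-range. ``MyExtendedLexer`` is a placeholder name.
#
#   ctx = LexerContext(source_text, 0)
#   tokens = list(MyExtendedLexer().get_tokens_unprocessed(context=ctx))
#   # ctx.pos and ctx.stack now reflect the final lexer position and state.
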
def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary

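
# --- Editorial example (not part of the original module) --------------------
# A sketch of what do_insertions() does: given the base token stream of
# ``'ab'`` and one insertion at index 1, the inserted tokens appear between
# the two halves of the split base token, with running positions:
#
#   list(do_insertions([(1, [(0, Other, 'X')])], [(0, Text, 'ab')]))
#   # -> [(0, Text, 'a'), (1, Other, 'X'), (2, Text, 'b')]
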
class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls  tottime  percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)
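

# --- Editorial example (not part of the original module) --------------------
# One way to profile an existing RegexLexer subclass is to derive a variant
# that also inherits from ProfilingRegexLexer, so its token regexes are
# compiled through the timing wrapper above; the table is printed once the
# token generator is exhausted. ``MyLexer`` is a placeholder name, and this
# is only a sketch of the idea, not an official API.
#
#   class MyLexerProfiled(ProfilingRegexLexer, MyLexer):
#       pass
#
#   list(MyLexerProfiled().get_tokens(source_text))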