1 | """ |
2 | pygments.lexer | |
3 | ~~~~~~~~~~~~~~ | |
4 | ||
5 | Base lexer classes. | |
6 | ||
7 | :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS. | |
8 | :license: BSD, see LICENSE for details. | |
9 | """ | |
10 | ||
11 | import re | |
12 | import sys | |
13 | import time | |
14 | ||
15 | from pip._vendor.pygments.filter import apply_filters, Filter | |
16 | from pip._vendor.pygments.filters import get_filter_by_name | |
17 | from pip._vendor.pygments.token import Error, Text, Other, Whitespace, _TokenType | |
18 | from pip._vendor.pygments.util import get_bool_opt, get_int_opt, get_list_opt, \ | |
19 | make_analysator, Future, guess_decode | |
20 | from pip._vendor.pygments.regexopt import regex_opt | |
21 | ||
22 | __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer', | |
23 | 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this', | |
24 | 'default', 'words', 'line_re'] | |
25 | ||
26 | line_re = re.compile('.*?\n') | |
27 | ||
28 | _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'), | |
29 | (b'\xff\xfe\0\0', 'utf-32'), | |
30 | (b'\0\0\xfe\xff', 'utf-32be'), | |
31 | (b'\xff\xfe', 'utf-16'), | |
32 | (b'\xfe\xff', 'utf-16be')] | |
33 | ||
34 | _default_analyse = staticmethod(lambda x: 0.0) | |


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    See also :doc:`lexerdevelopment`, a high-level guide to writing
    lexers.

    Lexer classes have attributes used for choosing the most appropriate
    lexer based on various criteria.

    .. autoattribute:: name
       :no-value:
    .. autoattribute:: aliases
       :no-value:
    .. autoattribute:: filenames
       :no-value:
    .. autoattribute:: alias_filenames
    .. autoattribute:: mimetypes
       :no-value:
    .. autoattribute:: priority

    Lexers included in Pygments should have an additional attribute:

    .. autoattribute:: url
       :no-value:

    You can pass options to the constructor. The basic options recognized
    by all lexers and processed by the base `Lexer` class are:

    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True).  This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection).  Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Full name of the lexer, in human-readable form
    name = None

    #: A list of short, unique identifiers that can be used to look
    #: up the lexer from a list, e.g., using `get_lexer_by_name()`.
    aliases = []

    #: A list of `fnmatch` patterns that match filenames which contain
    #: content for this lexer. The patterns in this list should be unique among
    #: all lexers.
    filenames = []

    #: A list of `fnmatch` patterns that match filenames which may or may not
    #: contain content for this lexer. This list is used by the
    #: :func:`.guess_lexer_for_filename()` function, to determine which lexers
    #: are then included in guessing the correct one. That means that
    #: e.g. every lexer for HTML and a template language should include
    #: ``\*.html`` in this list.
    alias_filenames = []

    #: A list of MIME types for content that can be lexed with this lexer.
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    #: URL of the language specification/definition. Used in the Pygments
    #: documentation.
    url = None

    def __init__(self, **options):
        """
        This constructor takes arbitrary options as keyword arguments.
        Every subclass must first process its own options and then call
        the `Lexer` constructor, since it processes the basic
        options like `stripnl`.

        An example looks like this:

        .. sourcecode:: python

           def __init__(self, **options):
               self.compress = options.get('compress', '')
               Lexer.__init__(self, **options)

        As these options must all be specifiable as strings (due to the
        command line usage), there are various utility functions
        available to help with that, see `Utilities`_.
        """
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        A static method which is called for lexer guessing.

        It should analyse the text and return a float in the range
        from ``0.0`` to ``1.0``.  If it returns ``0.0``, the lexer
        will not be selected as the most probable one; if it returns
        ``1.0``, it will be selected immediately.  This is used by
        `guess_lexer`.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`,
        it's the same as if the return value was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        This method is the basic interface of a lexer. It is called by
        the `highlight()` function. It must process the text and return an
        iterable of ``(tokentype, value)`` pairs from `text`.

        Normally, you don't need to override this method. The default
        implementation processes the options recognized by all lexers
        (`stripnl`, `stripall` and so on), and then yields all tokens
        from `get_tokens_unprocessed()`, with the ``index`` dropped.

        If `unfiltered` is set to `True`, the filtering mechanism is
        bypassed even if filters are defined.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    from pip._vendor import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        This method should process the text and return an iterable of
        ``(index, tokentype, value)`` tuples where ``index`` is the starting
        position of the token within the input text.

        It must be overridden by subclasses. It is recommended to
        implement it as a generator to maximize effectiveness.
        """
        raise NotImplementedError
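
# Illustrative sketch, not part of the original module: how the `Lexer`
# interface above is typically consumed.  `PythonLexer` is just one concrete
# subclass used as an example; any lexer accepts the options documented in
# the class docstring.
#
#     from pip._vendor.pygments.lexers import PythonLexer
#
#     lexer = PythonLexer(stripall=True, tabsize=4)
#     for tokentype, value in lexer.get_tokens('print("hi")\n'):
#         print(tokentype, repr(value))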


class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments: a root lexer and
    a language lexer. First everything is scanned using the language
    lexer; afterwards, all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))
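
# Illustrative sketch, not part of the original module: a delegating lexer for
# a hypothetical template syntax embedded in HTML.  `MyTemplateLexer` is an
# assumed lexer that yields `Other` tokens for the literal HTML between its
# template constructs; those stretches are then re-lexed by `HtmlLexer`.
#
#     from pip._vendor.pygments.lexers import HtmlLexer
#
#     class MyTemplateHtmlLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super().__init__(HtmlLexer, MyTemplateLexer, **options)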


# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name
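
# Illustrative sketch, not part of the original module: `inherit` keeps the
# parent's rules when a subclass overrides a state.  `BaseLexer` is a
# hypothetical RegexLexer subclass; `Name` comes from pip._vendor.pygments.token.
#
#     class ExtendedLexer(BaseLexer):
#         tokens = {
#             'root': [
#                 (r'@\w+', Name.Decorator),   # new rule, tried first
#                 inherit,                     # then all of BaseLexer's 'root' rules
#             ],
#         }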


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback
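
# Illustrative sketch, not part of the original module: a rule using `bygroups`
# so that each regex group gets its own token type (token types from
# pip._vendor.pygments.token); the rule itself is made up.
#
#     (r'(def)(\s+)([a-zA-Z_]\w*)', bygroups(Keyword, Whitespace, Name.Function)),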


class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
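
# Illustrative sketch, not part of the original module: two common ways to use
# `using` in a rule.  `JavascriptLexer` is only an example target lexer, and
# the 'expression' state is assumed to exist in the current lexer.
#
#     from pip._vendor.pygments.lexers import JavascriptLexer
#
#     # hand everything the rule matches to another lexer
#     (r'(?s)<script>.*?</script>', using(JavascriptLexer)),
#     # re-lex the match with the *current* lexer, starting in a given state
#     (r'\{\{.*?\}\}', using(this, state='expression')),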


class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example, ``default('#pop')`` is equivalent to ``('', Token, '#pop')``.
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state
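
# Illustrative sketch, not part of the original module: `default` transitions
# without consuming any input.  Here, if nothing else in the made-up
# 'optional-tail' state matches, the state is simply popped.
#
#     'optional-tail': [
#         (r';', Punctuation, '#pop'),
#         default('#pop'),
#     ],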


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
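
# Illustrative sketch, not part of the original module: matching a fixed set of
# keywords with `words`; prefix/suffix keep the optimized regex on word
# boundaries.  `Keyword` comes from pip._vendor.pygments.token.
#
#     (words(('if', 'elif', 'else', 'while'), prefix=r'\b', suffix=r'\b'), Keyword),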


class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically.  States that *are* defined by subclasses will, by
        default, override that state in the superclass.  If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states.  Initially, the stack contains
    #: a single state 'root'.  The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack.  This ensures
    #: the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break
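
# Illustrative sketch, not part of the original module: a minimal RegexLexer
# subclass showing states, `include`, `words`, and stack transitions.  The
# grammar is made up; the token types come from pip._vendor.pygments.token.
#
#     from pip._vendor.pygments.token import (Comment, Keyword, Name, Operator,
#                                             String, Whitespace)
#
#     class ToyLexer(RegexLexer):
#         name = 'Toy'
#         aliases = ['toy']
#         tokens = {
#             'whitespace': [
#                 (r'\s+', Whitespace),
#             ],
#             'root': [
#                 include('whitespace'),
#                 (r'#.*$', Comment.Single),
#                 (words(('let', 'if', 'else'), suffix=r'\b'), Keyword),
#                 (r'"', String, 'string'),        # push the 'string' state
#                 (r'[A-Za-z_]\w*', Name),
#                 (r'[=+\-*/()]', Operator),
#             ],
#             'string': [
#                 (r'[^"\\]+', String),
#                 (r'\\.', String.Escape),
#                 (r'"', String, '#pop'),          # back to 'root'
#             ],
#         }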


class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                                # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
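
# Illustrative sketch, not part of the original module: callbacks used with
# ExtendedRegexLexer receive the LexerContext and must advance `ctx.pos`
# themselves.  `heredoc_callback` and the 'heredoc' state are hypothetical;
# `String` comes from pip._vendor.pygments.token.
#
#     def heredoc_callback(lexer, match, ctx):
#         yield match.start(), String.Heredoc, match.group()
#         ctx.stack.append('heredoc')   # adjust the state stack directly
#         ctx.pos = match.end()         # required: move the position forward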


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary
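
# Illustrative sketch, not part of the original module: the shape of the
# `insertions` argument.  Each entry says "splice these (index, token, value)
# triples into the main stream once `index` characters of it have been
# emitted"; `DelegatingLexer` above builds exactly this structure.  The
# variable names and values here are made up.
#
#     insertions = [
#         (0,  [(0, Comment.Preproc, '<%')]),
#         (12, [(14, Comment.Preproc, '%>')]),
#     ]
#     merged = do_insertions(insertions,
#                            root_lexer.get_tokens_unprocessed(buffered))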


class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls  tottime  percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)