jinja2.lexer

Implements a Jinja / Python combination lexer. The Lexer class is used to do some preprocessing. It filters out invalid operators like the bitshift operators we don't allow in templates. It separates template code and python code in expressions.

View Source

  1"""Implements a Jinja / Python combination lexer. The ``Lexer`` class
  2is used to do some preprocessing. It filters out invalid operators like
  3the bitshift operators we don't allow in templates. It separates
  4template code and python code in expressions.
  5"""
  6import re
  7import typing as t
  8from ast import literal_eval
  9from collections import deque
 10from sys import intern
 11
 12from ._identifier import pattern as name_re
 13from .exceptions import TemplateSyntaxError
 14from .utils import LRUCache
 15
 16if t.TYPE_CHECKING:
 17    import typing_extensions as te
 18    from .environment import Environment
 19
# Cache of lexers, keyed by the tuple of environment settings assembled in
# ``get_lexer``. It exists so that multiple environments with the same
# settings can share one lexer. The cache is an ``LRUCache`` created with
# a capacity argument of 50.
_lexer_cache: t.MutableMapping[t.Tuple, "Lexer"] = LRUCache(50)  # type: ignore
 23
# static regular expressions

# One or more whitespace characters.
whitespace_re = re.compile(r"\s+")
# A single line break: "\r\n", "\r" or "\n". The capture group means that
# ``newline_re.split`` also returns the separators.
newline_re = re.compile(r"(\r\n|\r|\n)")
# A single- or double-quoted string literal that may contain backslash
# escapes. ``re.S`` lets an escaped character be a newline.
string_re = re.compile(
    r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S
)
# Integer literals in binary, octal, hex or decimal notation, with optional
# single "_" separators between digits. Matching is case-insensitive.
integer_re = re.compile(
    r"""
    (
        0b(_?[0-1])+ # binary
    |
        0o(_?[0-7])+ # octal
    |
        0x(_?[\da-f])+ # hex
    |
        [1-9](_?\d)* # decimal
    |
        0(_?0)* # decimal zero
    )
    """,
    re.IGNORECASE | re.VERBOSE,
)
# Float literals: digits followed by either an exponent (with an optional
# fractional part before it) or a required fractional part. The lookbehind
# rejects a match that directly follows a ".".
float_re = re.compile(
    r"""
    (?<!\.)  # doesn't start with a .
    (\d+_)*\d+  # digits, possibly _ separated
    (
        (\.(\d+_)*\d+)?  # optional fractional part
        e[+\-]?(\d+_)*\d+  # exponent part
    |
        \.(\d+_)*\d+  # required fractional part
    )
    """,
    re.IGNORECASE | re.VERBOSE,
)
 59
# Intern the token type names and keep references to them. Interning makes
# the identity checks used elsewhere in this module (``token.type is
# TOKEN_EOF``) valid for these constants.

# operator token types
TOKEN_ADD = intern("add")
TOKEN_ASSIGN = intern("assign")
TOKEN_COLON = intern("colon")
TOKEN_COMMA = intern("comma")
TOKEN_DIV = intern("div")
TOKEN_DOT = intern("dot")
TOKEN_EQ = intern("eq")
TOKEN_FLOORDIV = intern("floordiv")
TOKEN_GT = intern("gt")
TOKEN_GTEQ = intern("gteq")
TOKEN_LBRACE = intern("lbrace")
TOKEN_LBRACKET = intern("lbracket")
TOKEN_LPAREN = intern("lparen")
TOKEN_LT = intern("lt")
TOKEN_LTEQ = intern("lteq")
TOKEN_MOD = intern("mod")
TOKEN_MUL = intern("mul")
TOKEN_NE = intern("ne")
TOKEN_PIPE = intern("pipe")
TOKEN_POW = intern("pow")
TOKEN_RBRACE = intern("rbrace")
TOKEN_RBRACKET = intern("rbracket")
TOKEN_RPAREN = intern("rparen")
TOKEN_SEMICOLON = intern("semicolon")
TOKEN_SUB = intern("sub")
TOKEN_TILDE = intern("tilde")
# token types produced by the rules used inside tags
TOKEN_WHITESPACE = intern("whitespace")
TOKEN_FLOAT = intern("float")
TOKEN_INTEGER = intern("integer")
TOKEN_NAME = intern("name")
TOKEN_STRING = intern("string")
TOKEN_OPERATOR = intern("operator")
# delimiter and structural token types; the ``*_begin`` names double as
# lexer state names in ``Lexer.rules``
TOKEN_BLOCK_BEGIN = intern("block_begin")
TOKEN_BLOCK_END = intern("block_end")
TOKEN_VARIABLE_BEGIN = intern("variable_begin")
TOKEN_VARIABLE_END = intern("variable_end")
TOKEN_RAW_BEGIN = intern("raw_begin")
TOKEN_RAW_END = intern("raw_end")
TOKEN_COMMENT_BEGIN = intern("comment_begin")
TOKEN_COMMENT_END = intern("comment_end")
TOKEN_COMMENT = intern("comment")
TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin")
TOKEN_LINESTATEMENT_END = intern("linestatement_end")
TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin")
TOKEN_LINECOMMENT_END = intern("linecomment_end")
TOKEN_LINECOMMENT = intern("linecomment")
# template text outside of any tag
TOKEN_DATA = intern("data")
# placeholder type of the first ``TokenStream.current`` token
TOKEN_INITIAL = intern("initial")
# marks the end of the token stream
TOKEN_EOF = intern("eof")
110
# bind operators to token types: maps the operator's source text to the
# specific token type that ``Lexer.wrap`` substitutes for TOKEN_OPERATOR
operators = {
    "+": TOKEN_ADD,
    "-": TOKEN_SUB,
    "/": TOKEN_DIV,
    "//": TOKEN_FLOORDIV,
    "*": TOKEN_MUL,
    "%": TOKEN_MOD,
    "**": TOKEN_POW,
    "~": TOKEN_TILDE,
    "[": TOKEN_LBRACKET,
    "]": TOKEN_RBRACKET,
    "(": TOKEN_LPAREN,
    ")": TOKEN_RPAREN,
    "{": TOKEN_LBRACE,
    "}": TOKEN_RBRACE,
    "==": TOKEN_EQ,
    "!=": TOKEN_NE,
    ">": TOKEN_GT,
    ">=": TOKEN_GTEQ,
    "<": TOKEN_LT,
    "<=": TOKEN_LTEQ,
    "=": TOKEN_ASSIGN,
    ".": TOKEN_DOT,
    ":": TOKEN_COLON,
    "|": TOKEN_PIPE,
    ",": TOKEN_COMMA,
    ";": TOKEN_SEMICOLON,
}

# inverse mapping (token type -> operator text), used to describe tokens
reverse_operators = {v: k for k, v in operators.items()}
# if two operators shared a token type, the inversion would silently lose one
assert len(operators) == len(reverse_operators), "operators dropped"
# Alternation of all escaped operators, longest first, so that a
# two-character operator such as "**" is tried before its prefix "*".
operator_re = re.compile(
    f"({'|'.join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))})"
)
146
# Token types that ``Lexer.wrap`` drops before tokens reach the stream.
ignored_tokens = frozenset(
    {
        TOKEN_WHITESPACE,
        TOKEN_COMMENT_BEGIN,
        TOKEN_COMMENT,
        TOKEN_COMMENT_END,
        TOKEN_LINECOMMENT_BEGIN,
        TOKEN_LINECOMMENT,
        TOKEN_LINECOMMENT_END,
    }
)
# Token types that ``Lexer.tokeniter`` does not yield when their text is empty.
ignore_if_empty = frozenset(
    {TOKEN_DATA, TOKEN_WHITESPACE, TOKEN_COMMENT, TOKEN_LINECOMMENT}
)
161
162
def _describe_token_type(token_type: str) -> str:
    """Return a human-readable description for a token type.

    Operator token types are described by their operator text. A few
    structural types have a fixed phrase. Any other type is returned
    unchanged.
    """
    operator_text = reverse_operators.get(token_type)

    if operator_text is not None:
        return operator_text

    descriptions = {
        TOKEN_COMMENT_BEGIN: "begin of comment",
        TOKEN_COMMENT_END: "end of comment",
        TOKEN_COMMENT: "comment",
        TOKEN_LINECOMMENT: "comment",
        TOKEN_BLOCK_BEGIN: "begin of statement block",
        TOKEN_BLOCK_END: "end of statement block",
        TOKEN_VARIABLE_BEGIN: "begin of print statement",
        TOKEN_VARIABLE_END: "end of print statement",
        TOKEN_LINESTATEMENT_BEGIN: "begin of line statement",
        TOKEN_LINESTATEMENT_END: "end of line statement",
        TOKEN_DATA: "template data / text",
        TOKEN_EOF: "end of template",
    }
    return descriptions.get(token_type, token_type)
181
182
def describe_token(token: "Token") -> str:
    """Return a description of the token.

    A name token is described by its value; every other token is
    described by its type.
    """
    is_name = token.type == TOKEN_NAME
    return token.value if is_name else _describe_token_type(token.type)
189
190
def describe_token_expr(expr: str) -> str:
    """Like `describe_token` but for token expressions.

    ``expr`` is either a bare token type or ``'token_type:token_value'``.
    For a name expression with a value, the value itself is returned.
    """
    # partition yields an empty separator when there is no ":" at all, in
    # which case the whole expression is the token type.
    token_type, separator, value = expr.partition(":")

    if separator and token_type == TOKEN_NAME:
        return value

    return _describe_token_type(token_type)
202
203
def count_newlines(value: str) -> int:
    """Count the line breaks in the string, as matched by ``newline_re``.

    This is useful for extensions that filter a stream.
    """
    return sum(1 for _match in newline_re.finditer(value))
209
210
def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
    """Compile the environment's start delimiters into a list of rules.

    Each rule is a ``(token_type, regex_source)`` pair. The rules are
    returned in descending order of delimiter length (ties are broken by
    the remaining tuple fields), so longer delimiters come first.
    """
    escape = re.escape
    delimiters = (
        (TOKEN_COMMENT_BEGIN, environment.comment_start_string),
        (TOKEN_BLOCK_BEGIN, environment.block_start_string),
        (TOKEN_VARIABLE_BEGIN, environment.variable_start_string),
    )
    # (length, token type, pattern) so that a plain sort orders by length
    candidates = [
        (len(delimiter), token_type, escape(delimiter))
        for token_type, delimiter in delimiters
    ]

    statement_prefix = environment.line_statement_prefix

    if statement_prefix is not None:
        # only recognized at the start of a line, after optional indentation
        candidates.append(
            (
                len(statement_prefix),
                TOKEN_LINESTATEMENT_BEGIN,
                r"^[ \t\v]*" + escape(statement_prefix),
            )
        )

    comment_prefix = environment.line_comment_prefix

    if comment_prefix is not None:
        candidates.append(
            (
                len(comment_prefix),
                TOKEN_LINECOMMENT_BEGIN,
                r"(?:^|(?<=\S))[^\S\r\n]*" + escape(comment_prefix),
            )
        )

    candidates.sort(reverse=True)
    return [(token_type, pattern) for _length, token_type, pattern in candidates]
250
251
class Failure:
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.

    :param message: the error message to raise.
    :param cls: the exception class to raise; defaults to
        `TemplateSyntaxError`.
    """

    def __init__(
        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
    ) -> None:
        self.message = message
        self.error_class = cls

    def __call__(self, lineno: int, filename: t.Optional[str]) -> "te.NoReturn":
        """Raise the configured error for the given line and file name."""
        # The error class takes ``(message, lineno, name, filename)``, as the
        # other call sites in this module show. Passing ``filename`` as the
        # third positional argument would store it as the template *name*,
        # so the name slot is filled with ``None`` explicitly.
        raise self.error_class(self.message, lineno, None, filename)
265
266
class Token(t.NamedTuple):
    # line number the token was found on
    lineno: int
    # token type name, e.g. one of the TOKEN_* constants
    type: str
    # token value as produced by the lexer
    value: str

    def __str__(self) -> str:
        return describe_token(self)

    def test(self, expr: str) -> bool:
        """Check this token against a token expression.

        The expression is either a bare token type or
        ``'token_type:token_value'``. Only string types and values can
        be tested this way.
        """
        # A plain equality check (not identity) is used on purpose:
        # test_any is usually handed strings that are not interned.
        if self.type == expr:
            return True

        if ":" not in expr:
            return False

        return expr.split(":", 1) == [self.type, self.value]

    def test_any(self, *iterable: str) -> bool:
        """Return True if any of the given token expressions matches."""
        return any(map(self.test, iterable))
293
294
class TokenStreamIterator:
    """Iterator over a token stream. It yields the stream's tokens one
    by one and stops as soon as the eof token is reached.
    """

    def __init__(self, stream: "TokenStream") -> None:
        self.stream = stream

    def __iter__(self) -> "TokenStreamIterator":
        return self

    def __next__(self) -> Token:
        stream = self.stream
        token = stream.current

        if token.type is not TOKEN_EOF:
            # advance the underlying stream and hand out the old token
            next(stream)
            return token

        stream.close()
        raise StopIteration
315
316
class TokenStream:
    """An iterable stream of :class:`Token` objects. The parser does not
    iterate over it; it calls :func:`next` to move one token ahead. The
    token the stream is currently positioned on is :attr:`current`.
    """

    def __init__(
        self,
        generator: t.Iterable[Token],
        name: t.Optional[str],
        filename: t.Optional[str],
    ):
        self._iter = iter(generator)
        # tokens pushed back onto the stream; consumed before ``_iter``
        self._pushed: "te.Deque[Token]" = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        # placeholder that the first advance replaces with the real token
        self.current = Token(1, TOKEN_INITIAL, "")
        next(self)

    def __iter__(self) -> TokenStreamIterator:
        return TokenStreamIterator(self)

    def __bool__(self) -> bool:
        if self._pushed:
            return True

        return self.current.type is not TOKEN_EOF

    @property
    def eos(self) -> bool:
        """True once the stream has reached its end."""
        return not self

    def push(self, token: Token) -> None:
        """Put a token back onto the stream."""
        self._pushed.append(token)

    def look(self) -> Token:
        """Return the token after the current one without consuming it."""
        previous = next(self)
        upcoming = self.current
        # restore the position: the peeked token goes back on the stream
        self.push(upcoming)
        self.current = previous
        return upcoming

    def skip(self, n: int = 1) -> None:
        """Go n tokens ahead."""
        for _step in range(n):
            next(self)

    def next_if(self, expr: str) -> t.Optional[Token]:
        """Advance and return the current token if it matches the token
        expression; otherwise leave the stream untouched and return `None`.
        """
        if not self.current.test(expr):
            return None

        return next(self)

    def skip_if(self, expr: str) -> bool:
        """Like :meth:`next_if` but only reports `True` or `False`."""
        matched = self.next_if(expr)
        return matched is not None

    def __next__(self) -> Token:
        """Move one token ahead and return the token that was current.

        Use the built-in :func:`next` instead of calling this directly.
        """
        previous = self.current

        if self._pushed:
            self.current = self._pushed.popleft()
            return previous

        if previous.type is TOKEN_EOF:
            # already at the end; nothing to advance to
            return previous

        try:
            self.current = next(self._iter)
        except StopIteration:
            self.close()

        return previous

    def close(self) -> None:
        """Close the stream: the current token becomes an eof token that
        keeps the current line number, and the source iterator is dropped.
        """
        self.current = Token(self.current.lineno, TOKEN_EOF, "")
        self._iter = iter(())
        self.closed = True

    def expect(self, expr: str) -> Token:
        """Require the current token to match the expression, consume it
        and return it. Accepts the same argument as
        :meth:`jinja2.lexer.Token.test`. Raises a `TemplateSyntaxError`
        if the token does not match.
        """
        if self.current.test(expr):
            return next(self)

        expr = describe_token_expr(expr)

        if self.current.type is TOKEN_EOF:
            message = f"unexpected end of template, expected {expr!r}."
        else:
            message = f"expected token {expr!r}, got {describe_token(self.current)!r}"

        raise TemplateSyntaxError(
            message,
            self.current.lineno,
            self.name,
            self.filename,
        )
424
425
def get_lexer(environment: "Environment") -> "Lexer":
    """Return a lexer for the environment, reusing a cached one when a
    lexer for the same settings is still in the cache.
    """
    # the environment settings that make up the cache key
    settings = (
        "block_start_string",
        "block_end_string",
        "variable_start_string",
        "variable_end_string",
        "comment_start_string",
        "comment_end_string",
        "line_statement_prefix",
        "line_comment_prefix",
        "trim_blocks",
        "lstrip_blocks",
        "newline_sequence",
        "keep_trailing_newline",
    )
    key = tuple(getattr(environment, setting) for setting in settings)
    cached = _lexer_cache.get(key)

    if cached is not None:
        return cached

    lexer = Lexer(environment)
    _lexer_cache[key] = lexer
    return lexer
448
449
class OptionalLStrip(tuple):
    """Tuple subclass that marks a rule's token tuple as a point in the
    lexer state where lstrip may be applied.
    """

    __slots__ = ()

    # This looks like a no-op, but instances cannot be created without
    # it: the positional members must be packed into a single iterable
    # for the tuple constructor.
    def __new__(cls, *members, **kwargs):  # type: ignore
        return tuple.__new__(cls, members)
461
462
class _Rule(t.NamedTuple):
    # One lexing rule of a lexer state; ``Lexer.tokeniter`` unpacks it as
    # ``(regex, tokens, new_state)``.

    # compiled pattern that is matched at the current source position
    pattern: t.Pattern[str]
    # a single token type for the whole match, or a tuple with one entry
    # per match group (a token type, "#bygroup" or a ``Failure``)
    tokens: t.Union[str, t.Tuple[str, ...], t.Tuple[Failure]]
    # state change after a match: ``None``, "#pop", "#bygroup" or a state name
    command: t.Optional[str]
467
468
class Lexer:
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment: "Environment") -> None:
        """Build the per-state lexing rules from the environment's
        delimiter and whitespace settings. Only a few plain settings are
        copied onto the lexer; the environment itself is not stored.
        """
        # shortcuts
        e = re.escape

        def c(x: str) -> t.Pattern[str]:
            # all rule patterns are compiled as multiline + dotall
            return re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules: t.List[_Rule] = [
            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
            _Rule(float_re, TOKEN_FLOAT, None),
            _Rule(integer_re, TOKEN_INTEGER, None),
            _Rule(name_re, TOKEN_NAME, None),
            _Rule(string_re, TOKEN_STRING, None),
            _Rule(operator_re, TOKEN_OPERATOR, None),
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        block_start_re = e(environment.block_start_string)
        block_end_re = e(environment.block_end_string)
        comment_end_re = e(environment.comment_end_string)
        variable_end_re = e(environment.variable_end_string)

        # block suffix if trimming is enabled: optionally consume one
        # newline directly after a closing delimiter
        block_suffix_re = "\\n?" if environment.trim_blocks else ""

        self.lstrip_blocks = environment.lstrip_blocks

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        # the complete opening raw tag; it is matched as one named group so
        # that the "#bygroup" state change enters the raw_begin state
        root_raw_re = (
            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
        )
        # one named group per start delimiter, each followed by a group that
        # captures the whitespace control sign ("-", "+" or empty)
        root_parts_re = "|".join(
            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
        )

        # global lexing rules, keyed by state name
        self.rules: t.Dict[str, t.List[_Rule]] = {
            "root": [
                # directives: text up to the next start delimiter, then a
                # switch to the state named by the group that matched
                _Rule(
                    c(rf"(.*?)(?:{root_parts_re})"),
                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
                    "#bygroup",
                ),
                # data: everything that is left when no delimiter follows
                _Rule(c(".+"), TOKEN_DATA, None),
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
                        rf"|{comment_end_re}{block_suffix_re}))"
                    ),
                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
                    "#pop",
                ),
                # any other character here means the comment was never closed
                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                _Rule(
                    c(
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re})"
                    ),
                    TOKEN_BLOCK_END,
                    "#pop",
                ),
            ]
            + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                _Rule(
                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
                    TOKEN_VARIABLE_END,
                    "#pop",
                )
            ]
            + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re}))"
                    ),
                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
                    "#pop",
                ),
                # any other character here means the raw block was never closed
                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
            ]
            + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                _Rule(
                    c(r"(.*?)()(?=\n|$)"),
                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
                    "#pop",
                )
            ],
        }

    def _normalize_newlines(self, value: str) -> str:
        """Replace all newlines with the configured sequence in strings
        and template data.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(
        self,
        source: str,
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> TokenStream:
        """Calls :meth:`tokeniter` and :meth:`wrap` and returns the result
        as a :class:`TokenStream`.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(
        self,
        stream: t.Iterable[t.Tuple[int, str, str]],
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
    ) -> t.Iterator[Token]:
        """This is called with the stream as returned by :meth:`tokeniter`
        and wraps every token in a :class:`Token` and converts the value.

        Tokens in ``ignored_tokens`` and the raw begin / end markers are
        dropped. Line statement delimiters become block delimiters, and
        generic operator tokens are replaced by their specific token type.

        :raises TemplateSyntaxError: for a name that is not a valid
            identifier, or a string literal that cannot be unescaped.
        """
        for lineno, token, value_str in stream:
            if token in ignored_tokens:
                continue

            value: t.Any = value_str

            if token == TOKEN_LINESTATEMENT_BEGIN:
                token = TOKEN_BLOCK_BEGIN
            elif token == TOKEN_LINESTATEMENT_END:
                token = TOKEN_BLOCK_END
            # we are not interested in those tokens in the parser
            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
                continue
            elif token == TOKEN_DATA:
                value = self._normalize_newlines(value_str)
            # NOTE(review): no rule in this class emits a "keyword" token;
            # this branch looks like a leftover - confirm before removing.
            elif token == "keyword":
                token = value_str
            elif token == TOKEN_NAME:
                value = value_str

                if not value.isidentifier():
                    raise TemplateSyntaxError(
                        "Invalid character in identifier", lineno, name, filename
                    )
            elif token == TOKEN_STRING:
                # try to unescape string: strip the surrounding quotes, then
                # resolve backslash escapes while keeping non-ASCII text intact
                try:
                    value = (
                        self._normalize_newlines(value_str[1:-1])
                        .encode("ascii", "backslashreplace")
                        .decode("unicode-escape")
                    )
                except Exception as e:
                    # keep only the last ":"-separated part of the message
                    msg = str(e).split(":")[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
            elif token == TOKEN_INTEGER:
                # base 0 lets int() pick the base from the 0b / 0o / 0x prefix
                value = int(value_str.replace("_", ""), 0)
            elif token == TOKEN_FLOAT:
                # remove all "_" first to support more Python versions
                value = literal_eval(value_str.replace("_", ""))
            elif token == TOKEN_OPERATOR:
                token = operators[value_str]

            yield Token(lineno, token, value)

    def tokeniter(
        self,
        source: str,
        name: t.Optional[str],
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> t.Iterator[t.Tuple[int, str, str]]:
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.

        Each item is a ``(lineno, token_type, value)`` tuple with the raw,
        unconverted source text as value.

        :param source: the template source.
        :param name: template name, used in error reports.
        :param filename: template file name, used in error reports.
        :param state: optional initial state: ``"root"``, ``"variable"``
            or ``"block"``.
        :raises TemplateSyntaxError: on unbalanced brackets, unterminated
            comments or raw blocks, or characters no rule matches.

        .. versionchanged:: 3.0
            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
            breaks.
        """
        # split() also returns the captured separators; [::2] keeps the lines
        lines = newline_re.split(source)[::2]

        # a trailing line break leaves an empty last element; drop it
        # unless trailing newlines are to be kept
        if not self.keep_trailing_newline and lines[-1] == "":
            del lines[-1]

        # from here on every line break in the source is a plain "\n"
        source = "\n".join(lines)
        pos = 0
        lineno = 1
        stack = ["root"]

        if state is not None and state != "root":
            assert state in ("variable", "block"), "invalid state"
            stack.append(state + "_begin")

        statetokens = self.rules[stack[-1]]
        source_length = len(source)
        # closing brackets that are still expected, innermost last
        balancing_stack: t.List[str] = []
        # newlines removed by a "-" strip sign that still have to be counted
        newlines_stripped = 0
        # whether the current position is at the start of a line
        line_starting = True

        while True:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)

                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and tokens in (
                    TOKEN_VARIABLE_END,
                    TOKEN_BLOCK_END,
                    TOKEN_LINESTATEMENT_END,
                ):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    groups: t.Sequence[str] = m.groups()

                    if isinstance(tokens, OptionalLStrip):
                        # Rule supports lstrip. Match will look like
                        # text, block type, whitespace control, type, control, ...
                        text = groups[0]
                        # Skipping the text and first type, every other group is the
                        # whitespace control for each type. One of the groups will be
                        # -, +, or empty string instead of None.
                        strip_sign = next(g for g in groups[2::2] if g is not None)

                        if strip_sign == "-":
                            # Strip all whitespace between the text and the tag.
                            stripped = text.rstrip()
                            # remember the removed newlines so that the line
                            # number stays correct after the text is yielded
                            newlines_stripped = text[len(stripped) :].count("\n")
                            groups = [stripped, *groups[1:]]
                        elif (
                            # Not marked for preserving whitespace.
                            strip_sign != "+"
                            # lstrip is enabled.
                            and self.lstrip_blocks
                            # Not a variable expression.
                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                        ):
                            # The start of text between the last newline and the tag.
                            l_pos = text.rfind("\n") + 1

                            if l_pos > 0 or line_starting:
                                # If there's only whitespace between the newline and the
                                # tag, strip it.
                                if whitespace_re.fullmatch(text, l_pos):
                                    groups = [text[:l_pos], *groups[1:]]

                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == "#bygroup":
                            for key, value in m.groupdict().items():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count("\n")
                                    break
                            else:
                                raise RuntimeError(
                                    f"{regex!r} wanted to resolve the token dynamically"
                                    " but no group matched"
                                )
                        # normal group
                        else:
                            data = groups[idx]

                            if data or token not in ignore_if_empty:
                                yield lineno, token, data

                            lineno += data.count("\n") + newlines_stripped
                            newlines_stripped = 0

                # strings as token are just yielded as is.
                else:
                    data = m.group()

                    # update brace/parentheses balance
                    if tokens == TOKEN_OPERATOR:
                        if data == "{":
                            balancing_stack.append("}")
                        elif data == "(":
                            balancing_stack.append(")")
                        elif data == "[":
                            balancing_stack.append("]")
                        elif data in ("}", ")", "]"):
                            if not balancing_stack:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}'", lineno, name, filename
                                )

                            expected_op = balancing_stack.pop()

                            if expected_op != data:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}', expected '{expected_op}'",
                                    lineno,
                                    name,
                                    filename,
                                )

                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data

                    lineno += data.count("\n")

                # the next match starts a line if this one ended in a newline
                line_starting = m.group()[-1:] == "\n"
                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == "#pop":
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == "#bygroup":
                        for key, value in m.groupdict().items():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError(
                                f"{regex!r} wanted to resolve the new state dynamically"
                                f" but no group matched"
                            )
                    # direct state name given
                    else:
                        stack.append(new_state)

                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError(
                        f"{regex!r} yielded empty string without stack change"
                    )

                # publish the new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return

                # something went wrong
                raise TemplateSyntaxError(
                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
                )

whitespace_re = re.compile('\\s+')

newline_re = re.compile('(\\r\\n|\\r|\\n)')

string_re = re.compile('(\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)")', re.DOTALL)

float_re = re.compile("\n (?<!\\.) # doesn't start with a .\n (\\d+_)*\\d+ # digits, possibly _ separated\n (\n (\\.(\\d+_)*\\d+)? # optional fractional part\n e[+\\-]?(\\d+_)*\\d+ # exponent part\n |\n \\.(\\d+_)*\\d+ # required fractional part\n )\n ", re.IGNORECASE|re.VERBOSE)

TOKEN_ADD = 'add'

TOKEN_ASSIGN = 'assign'

TOKEN_COLON = 'colon'

TOKEN_COMMA = 'comma'

TOKEN_DIV = 'div'

TOKEN_DOT = 'dot'

TOKEN_EQ = 'eq'

TOKEN_FLOORDIV = 'floordiv'

TOKEN_GT = 'gt'

TOKEN_GTEQ = 'gteq'

TOKEN_LBRACE = 'lbrace'

TOKEN_LBRACKET = 'lbracket'

TOKEN_LPAREN = 'lparen'

TOKEN_LT = 'lt'

TOKEN_LTEQ = 'lteq'

TOKEN_MOD = 'mod'

TOKEN_MUL = 'mul'

TOKEN_NE = 'ne'

TOKEN_PIPE = 'pipe'

TOKEN_POW = 'pow'

TOKEN_RBRACE = 'rbrace'

TOKEN_RBRACKET = 'rbracket'

TOKEN_RPAREN = 'rparen'

TOKEN_SEMICOLON = 'semicolon'

TOKEN_SUB = 'sub'

TOKEN_TILDE = 'tilde'

TOKEN_WHITESPACE = 'whitespace'

TOKEN_FLOAT = 'float'

TOKEN_INTEGER = 'integer'

TOKEN_NAME = 'name'

TOKEN_STRING = 'string'

TOKEN_OPERATOR = 'operator'

TOKEN_BLOCK_BEGIN = 'block_begin'

TOKEN_BLOCK_END = 'block_end'

TOKEN_VARIABLE_BEGIN = 'variable_begin'

TOKEN_VARIABLE_END = 'variable_end'

TOKEN_RAW_BEGIN = 'raw_begin'

TOKEN_RAW_END = 'raw_end'

TOKEN_COMMENT_BEGIN = 'comment_begin'

TOKEN_COMMENT_END = 'comment_end'

TOKEN_COMMENT = 'comment'

TOKEN_LINESTATEMENT_BEGIN = 'linestatement_begin'

TOKEN_LINESTATEMENT_END = 'linestatement_end'

TOKEN_LINECOMMENT_BEGIN = 'linecomment_begin'

TOKEN_LINECOMMENT_END = 'linecomment_end'

TOKEN_LINECOMMENT = 'linecomment'

TOKEN_DATA = 'data'

TOKEN_INITIAL = 'initial'

TOKEN_EOF = 'eof'

operators = {'+': 'add', '-': 'sub', '/': 'div', '//': 'floordiv', '*': 'mul', '%': 'mod', '**': 'pow', '~': 'tilde', '[': 'lbracket', ']': 'rbracket', '(': 'lparen', ')': 'rparen', '{': 'lbrace', '}': 'rbrace', '==': 'eq', '!=': 'ne', '>': 'gt', '>=': 'gteq', '<': 'lt', '<=': 'lteq', '=': 'assign', '.': 'dot', ':': 'colon', '|': 'pipe', ',': 'comma', ';': 'semicolon'}

reverse_operators = {'add': '+', 'sub': '-', 'div': '/', 'floordiv': '//', 'mul': '*', 'mod': '%', 'pow': '**', 'tilde': '~', 'lbracket': '[', 'rbracket': ']', 'lparen': '(', 'rparen': ')', 'lbrace': '{', 'rbrace': '}', 'eq': '==', 'ne': '!=', 'gt': '>', 'gteq': '>=', 'lt': '<', 'lteq': '<=', 'assign': '=', 'dot': '.', 'colon': ':', 'pipe': '|', 'comma': ',', 'semicolon': ';'}

operator_re = re.compile('(//|\\*\\*|==|!=|>=|<=|\\+|\\-|/|\\*|%|\\~|\\[|\\]|\\(|\\)|\\{|\\}|>|<|=|\\.|:|\\||,|;)')

ignored_tokens = frozenset({'linecomment_end', 'whitespace', 'comment_begin', 'linecomment', 'linecomment_begin', 'comment_end', 'comment'})

ignore_if_empty = frozenset({'linecomment', 'whitespace', 'comment', 'data'})

def describe_token(token: Token) -> str: View Source

184def describe_token(token: "Token") -> str:
185    """Returns a description of the token."""
186    if token.type == TOKEN_NAME:
187        return token.value
188
189    return _describe_token_type(token.type)

Returns a description of the token.

def describe_token_expr(expr: str) -> str: View Source

192def describe_token_expr(expr: str) -> str:
193    """Like `describe_token` but for token expressions."""
194    if ":" in expr:
195        type, value = expr.split(":", 1)
196
197        if type == TOKEN_NAME:
198            return value
199    else:
200        type = expr
201
202    return _describe_token_type(type)

Like describe_token but for token expressions.

def count_newlines(value: str) -> int: View Source

205def count_newlines(value: str) -> int:
206    """Count the number of newline sequences in the string.  This is
207    useful for extensions that filter a stream.
208    """
209    return len(newline_re.findall(value))

Count the number of newline sequences in the string; a \r\n pair counts as one. This is useful for extensions that filter a stream.

def compile_rules(environment: jinja2.environment.Environment) -> List[Tuple[str, str]]: View Source

212def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
213    """Compiles all the rules from the environment into a list of rules."""
214    e = re.escape
215    rules = [
216        (
217            len(environment.comment_start_string),
218            TOKEN_COMMENT_BEGIN,
219            e(environment.comment_start_string),
220        ),
221        (
222            len(environment.block_start_string),
223            TOKEN_BLOCK_BEGIN,
224            e(environment.block_start_string),
225        ),
226        (
227            len(environment.variable_start_string),
228            TOKEN_VARIABLE_BEGIN,
229            e(environment.variable_start_string),
230        ),
231    ]
232
233    if environment.line_statement_prefix is not None:
234        rules.append(
235            (
236                len(environment.line_statement_prefix),
237                TOKEN_LINESTATEMENT_BEGIN,
238                r"^[ \t\v]*" + e(environment.line_statement_prefix),
239            )
240        )
241    if environment.line_comment_prefix is not None:
242        rules.append(
243            (
244                len(environment.line_comment_prefix),
245                TOKEN_LINECOMMENT_BEGIN,
246                r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix),
247            )
248        )
249
250    return [x[1:] for x in sorted(rules, reverse=True)]

Compiles all the rules from the environment into a list of rules.

class Failure: View Source

253class Failure:
254    """Class that raises a `TemplateSyntaxError` if called.
255    Used by the `Lexer` to specify known errors.
256    """
257
258    def __init__(
259        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
260    ) -> None:
261        self.message = message
262        self.error_class = cls
263
264    def __call__(self, lineno: int, filename: str) -> "te.NoReturn":
265        raise self.error_class(self.message, lineno, filename)

Class that raises a TemplateSyntaxError if called. Used by the Lexer to specify known errors.

Failure( message: str, cls: Type[jinja2.exceptions.TemplateSyntaxError] = <class 'jinja2.exceptions.TemplateSyntaxError'>) View Source

258    def __init__(
259        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
260    ) -> None:
261        self.message = message
262        self.error_class = cls

message

error_class

class Token(typing.NamedTuple): View Source

268class Token(t.NamedTuple):
269    lineno: int
270    type: str
271    value: str
272
273    def __str__(self) -> str:
274        return describe_token(self)
275
276    def test(self, expr: str) -> bool:
277        """Test a token against a token expression.  This can either be a
278        token type or ``'token_type:token_value'``.  This can only test
279        against string values and types.
280        """
281        # here we do a regular string equality check as test_any is usually
282        # passed an iterable of non-interned strings.
283        if self.type == expr:
284            return True
285
286        if ":" in expr:
287            return expr.split(":", 1) == [self.type, self.value]
288
289        return False
290
291    def test_any(self, *iterable: str) -> bool:
292        """Test against multiple token expressions."""
293        return any(self.test(expr) for expr in iterable)

Token(lineno, type, value)

Token(lineno: int, type: str, value: str)

Create new instance of Token(lineno, type, value)

lineno: int

Alias for field number 0

type: str

Alias for field number 1

value: str

Alias for field number 2

def test(self, expr: str) -> bool: View Source

276    def test(self, expr: str) -> bool:
277        """Test a token against a token expression.  This can either be a
278        token type or ``'token_type:token_value'``.  This can only test
279        against string values and types.
280        """
281        # here we do a regular string equality check as test_any is usually
282        # passed an iterable of non-interned strings.
283        if self.type == expr:
284            return True
285
286        if ":" in expr:
287            return expr.split(":", 1) == [self.type, self.value]
288
289        return False

Test a token against a token expression. This can either be a token type or 'token_type:token_value'. This can only test against string values and types.

def test_any(self, *iterable: str) -> bool: View Source

291    def test_any(self, *iterable: str) -> bool:
292        """Test against multiple token expressions."""
293        return any(self.test(expr) for expr in iterable)

Test against multiple token expressions.

Inherited Members

builtins.tuple: index; count

class TokenStreamIterator: View Source

296class TokenStreamIterator:
297    """The iterator for tokenstreams.  Iterate over the stream
298    until the eof token is reached.
299    """
300
301    def __init__(self, stream: "TokenStream") -> None:
302        self.stream = stream
303
304    def __iter__(self) -> "TokenStreamIterator":
305        return self
306
307    def __next__(self) -> Token:
308        token = self.stream.current
309
310        if token.type is TOKEN_EOF:
311            self.stream.close()
312            raise StopIteration
313
314        next(self.stream)
315        return token

The iterator for tokenstreams. Iterate over the stream until the eof token is reached.

TokenStreamIterator(stream: TokenStream) View Source

301    def __init__(self, stream: "TokenStream") -> None:
302        self.stream = stream

stream

def get_lexer(environment: jinja2.environment.Environment) -> Lexer: View Source

427def get_lexer(environment: "Environment") -> "Lexer":
428    """Return a lexer which is probably cached."""
429    key = (
430        environment.block_start_string,
431        environment.block_end_string,
432        environment.variable_start_string,
433        environment.variable_end_string,
434        environment.comment_start_string,
435        environment.comment_end_string,
436        environment.line_statement_prefix,
437        environment.line_comment_prefix,
438        environment.trim_blocks,
439        environment.lstrip_blocks,
440        environment.newline_sequence,
441        environment.keep_trailing_newline,
442    )
443    lexer = _lexer_cache.get(key)
444
445    if lexer is None:
446        _lexer_cache[key] = lexer = Lexer(environment)
447
448    return lexer

Return a lexer which is probably cached.

class OptionalLStrip(builtins.tuple): View Source

451class OptionalLStrip(tuple):
452    """A special tuple for marking a point in the state that can have
453    lstrip applied.
454    """
455
456    __slots__ = ()
457
458    # Even though it looks like a no-op, creating instances fails
459    # without this.
460    def __new__(cls, *members, **kwargs):  # type: ignore
461        return super().__new__(cls, members)

A special tuple for marking a point in the state that can have lstrip applied.

Inherited Members

builtins.tuple: index; count

class Lexer: View Source

470class Lexer:
471    """Class that implements a lexer for a given environment. Automatically
472    created by the environment class, usually you don't have to do that.
473
474    Note that the lexer is not automatically bound to an environment.
475    Multiple environments can share the same lexer.
476    """
477
478    def __init__(self, environment: "Environment") -> None:
479        # shortcuts
480        e = re.escape
481
482        def c(x: str) -> t.Pattern[str]:
483            return re.compile(x, re.M | re.S)
484
485        # lexing rules for tags
486        tag_rules: t.List[_Rule] = [
487            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
488            _Rule(float_re, TOKEN_FLOAT, None),
489            _Rule(integer_re, TOKEN_INTEGER, None),
490            _Rule(name_re, TOKEN_NAME, None),
491            _Rule(string_re, TOKEN_STRING, None),
492            _Rule(operator_re, TOKEN_OPERATOR, None),
493        ]
494
495        # assemble the root lexing rule. because "|" takes the first match
496        # we have to sort by length so that the lexer continues working
497        # as expected when we have parsing rules like <% for block and
498        # <%= for variables. (if someone wants asp like syntax)
499        # variables are just part of the rules if variable processing
500        # is required.
501        root_tag_rules = compile_rules(environment)
502
503        block_start_re = e(environment.block_start_string)
504        block_end_re = e(environment.block_end_string)
505        comment_end_re = e(environment.comment_end_string)
506        variable_end_re = e(environment.variable_end_string)
507
508        # block suffix if trimming is enabled
509        block_suffix_re = "\\n?" if environment.trim_blocks else ""
510
511        self.lstrip_blocks = environment.lstrip_blocks
512
513        self.newline_sequence = environment.newline_sequence
514        self.keep_trailing_newline = environment.keep_trailing_newline
515
516        root_raw_re = (
517            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
518            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
519        )
520        root_parts_re = "|".join(
521            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
522        )
523
524        # global lexing rules
525        self.rules: t.Dict[str, t.List[_Rule]] = {
526            "root": [
527                # directives
528                _Rule(
529                    c(rf"(.*?)(?:{root_parts_re})"),
530                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
531                    "#bygroup",
532                ),
533                # data
534                _Rule(c(".+"), TOKEN_DATA, None),
535            ],
536            # comments
537            TOKEN_COMMENT_BEGIN: [
538                _Rule(
539                    c(
540                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
541                        rf"|{comment_end_re}{block_suffix_re}))"
542                    ),
543                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
544                    "#pop",
545                ),
546                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
547            ],
548            # blocks
549            TOKEN_BLOCK_BEGIN: [
550                _Rule(
551                    c(
552                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
553                        rf"|{block_end_re}{block_suffix_re})"
554                    ),
555                    TOKEN_BLOCK_END,
556                    "#pop",
557                ),
558            ]
559            + tag_rules,
560            # variables
561            TOKEN_VARIABLE_BEGIN: [
562                _Rule(
563                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
564                    TOKEN_VARIABLE_END,
565                    "#pop",
566                )
567            ]
568            + tag_rules,
569            # raw block
570            TOKEN_RAW_BEGIN: [
571                _Rule(
572                    c(
573                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
574                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
575                        rf"|{block_end_re}{block_suffix_re}))"
576                    ),
577                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
578                    "#pop",
579                ),
580                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
581            ],
582            # line statements
583            TOKEN_LINESTATEMENT_BEGIN: [
584                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
585            ]
586            + tag_rules,
587            # line comments
588            TOKEN_LINECOMMENT_BEGIN: [
589                _Rule(
590                    c(r"(.*?)()(?=\n|$)"),
591                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
592                    "#pop",
593                )
594            ],
595        }
596
597    def _normalize_newlines(self, value: str) -> str:
598        """Replace all newlines with the configured sequence in strings
599        and template data.
600        """
601        return newline_re.sub(self.newline_sequence, value)
602
603    def tokenize(
604        self,
605        source: str,
606        name: t.Optional[str] = None,
607        filename: t.Optional[str] = None,
608        state: t.Optional[str] = None,
609    ) -> TokenStream:
610        """Calls tokeniter + wrap and wraps the result in a token stream."""
611        stream = self.tokeniter(source, name, filename, state)
612        return TokenStream(self.wrap(stream, name, filename), name, filename)
613
614    def wrap(
615        self,
616        stream: t.Iterable[t.Tuple[int, str, str]],
617        name: t.Optional[str] = None,
618        filename: t.Optional[str] = None,
619    ) -> t.Iterator[Token]:
620        """This is called with the stream as returned by `tokeniter` and wraps
621        every token in a :class:`Token` and converts the value.
622        """
623        for lineno, token, value_str in stream:
624            if token in ignored_tokens:
625                continue
626
627            value: t.Any = value_str
628
629            if token == TOKEN_LINESTATEMENT_BEGIN:
630                token = TOKEN_BLOCK_BEGIN
631            elif token == TOKEN_LINESTATEMENT_END:
632                token = TOKEN_BLOCK_END
633            # we are not interested in those tokens in the parser
634            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
635                continue
636            elif token == TOKEN_DATA:
637                value = self._normalize_newlines(value_str)
638            elif token == "keyword":
639                token = value_str
640            elif token == TOKEN_NAME:
641                value = value_str
642
643                if not value.isidentifier():
644                    raise TemplateSyntaxError(
645                        "Invalid character in identifier", lineno, name, filename
646                    )
647            elif token == TOKEN_STRING:
648                # try to unescape string
649                try:
650                    value = (
651                        self._normalize_newlines(value_str[1:-1])
652                        .encode("ascii", "backslashreplace")
653                        .decode("unicode-escape")
654                    )
655                except Exception as e:
656                    msg = str(e).split(":")[-1].strip()
657                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
658            elif token == TOKEN_INTEGER:
659                value = int(value_str.replace("_", ""), 0)
660            elif token == TOKEN_FLOAT:
661                # remove all "_" first to support more Python versions
662                value = literal_eval(value_str.replace("_", ""))
663            elif token == TOKEN_OPERATOR:
664                token = operators[value_str]
665
666            yield Token(lineno, token, value)
667
668    def tokeniter(
669        self,
670        source: str,
671        name: t.Optional[str],
672        filename: t.Optional[str] = None,
673        state: t.Optional[str] = None,
674    ) -> t.Iterator[t.Tuple[int, str, str]]:
675        """This method tokenizes the text and returns the tokens in a
676        generator. Use this method if you just want to tokenize a template.
677
678        .. versionchanged:: 3.0
679            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
680            breaks.
681        """
682        lines = newline_re.split(source)[::2]
683
684        if not self.keep_trailing_newline and lines[-1] == "":
685            del lines[-1]
686
687        source = "\n".join(lines)
688        pos = 0
689        lineno = 1
690        stack = ["root"]
691
692        if state is not None and state != "root":
693            assert state in ("variable", "block"), "invalid state"
694            stack.append(state + "_begin")
695
696        statetokens = self.rules[stack[-1]]
697        source_length = len(source)
698        balancing_stack: t.List[str] = []
699        newlines_stripped = 0
700        line_starting = True
701
702        while True:
703            # tokenizer loop
704            for regex, tokens, new_state in statetokens:
705                m = regex.match(source, pos)
706
707                # if no match we try again with the next rule
708                if m is None:
709                    continue
710
711                # we only match blocks and variables if braces / parentheses
712                # are balanced. continue parsing with the lower rule which
713                # is the operator rule. do this only if the end tags look
714                # like operators
715                if balancing_stack and tokens in (
716                    TOKEN_VARIABLE_END,
717                    TOKEN_BLOCK_END,
718                    TOKEN_LINESTATEMENT_END,
719                ):
720                    continue
721
722                # tuples support more options
723                if isinstance(tokens, tuple):
724                    groups: t.Sequence[str] = m.groups()
725
726                    if isinstance(tokens, OptionalLStrip):
727                        # Rule supports lstrip. Match will look like
728                        # text, block type, whitespace control, type, control, ...
729                        text = groups[0]
730                        # Skipping the text and first type, every other group is the
731                        # whitespace control for each type. One of the groups will be
732                        # -, +, or empty string instead of None.
733                        strip_sign = next(g for g in groups[2::2] if g is not None)
734
735                        if strip_sign == "-":
736                            # Strip all whitespace between the text and the tag.
737                            stripped = text.rstrip()
738                            newlines_stripped = text[len(stripped) :].count("\n")
739                            groups = [stripped, *groups[1:]]
740                        elif (
741                            # Not marked for preserving whitespace.
742                            strip_sign != "+"
743                            # lstrip is enabled.
744                            and self.lstrip_blocks
745                            # Not a variable expression.
746                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
747                        ):
748                            # The start of text between the last newline and the tag.
749                            l_pos = text.rfind("\n") + 1
750
751                            if l_pos > 0 or line_starting:
752                                # If there's only whitespace between the newline and the
753                                # tag, strip it.
754                                if whitespace_re.fullmatch(text, l_pos):
755                                    groups = [text[:l_pos], *groups[1:]]
756
757                    for idx, token in enumerate(tokens):
758                        # failure group
759                        if token.__class__ is Failure:
760                            raise token(lineno, filename)
761                        # bygroup is a bit more complex, in that case we
762                        # yield for the current token the first named
763                        # group that matched
764                        elif token == "#bygroup":
765                            for key, value in m.groupdict().items():
766                                if value is not None:
767                                    yield lineno, key, value
768                                    lineno += value.count("\n")
769                                    break
770                            else:
771                                raise RuntimeError(
772                                    f"{regex!r} wanted to resolve the token dynamically"
773                                    " but no group matched"
774                                )
775                        # normal group
776                        else:
777                            data = groups[idx]
778
779                            if data or token not in ignore_if_empty:
780                                yield lineno, token, data
781
782                            lineno += data.count("\n") + newlines_stripped
783                            newlines_stripped = 0
784
785                # plain string tokens are just yielded as is.
786                else:
787                    data = m.group()
788
789                    # update brace/parentheses balance
790                    if tokens == TOKEN_OPERATOR:
791                        if data == "{":
792                            balancing_stack.append("}")
793                        elif data == "(":
794                            balancing_stack.append(")")
795                        elif data == "[":
796                            balancing_stack.append("]")
797                        elif data in ("}", ")", "]"):
798                            if not balancing_stack:
799                                raise TemplateSyntaxError(
800                                    f"unexpected '{data}'", lineno, name, filename
801                                )
802
803                            expected_op = balancing_stack.pop()
804
805                            if expected_op != data:
806                                raise TemplateSyntaxError(
807                                    f"unexpected '{data}', expected '{expected_op}'",
808                                    lineno,
809                                    name,
810                                    filename,
811                                )
812
813                    # yield items
814                    if data or tokens not in ignore_if_empty:
815                        yield lineno, tokens, data
816
817                    lineno += data.count("\n")
818
819                line_starting = m.group()[-1:] == "\n"
820                # fetch new position into new variable so that we can check
821                # if there is an internal parsing error which would result
822                # in an infinite loop
823                pos2 = m.end()
824
825                # handle state changes
826                if new_state is not None:
827                    # remove the uppermost state
828                    if new_state == "#pop":
829                        stack.pop()
830                    # resolve the new state by group checking
831                    elif new_state == "#bygroup":
832                        for key, value in m.groupdict().items():
833                            if value is not None:
834                                stack.append(key)
835                                break
836                        else:
837                            raise RuntimeError(
838                                f"{regex!r} wanted to resolve the new state dynamically"
839                                f" but no group matched"
840                            )
841                    # direct state name given
842                    else:
843                        stack.append(new_state)
844
845                    statetokens = self.rules[stack[-1]]
846                # we are still at the same position and no stack change.
847                # this means a loop without break condition, avoid that and
848                # raise error
849                elif pos2 == pos:
850                    raise RuntimeError(
851                        f"{regex!r} yielded empty string without stack change"
852                    )
853
854                # publish new position and start again
855                pos = pos2
856                break
857            # if loop terminated without break we haven't found a single match
858            # either we are at the end of the file or we have a problem
859            else:
860                # end of text
861                if pos >= source_length:
862                    return
863
864                # something went wrong
865                raise TemplateSyntaxError(
866                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
867                )

Class that implements a lexer for a given environment. Automatically created by the environment class, usually you don't have to do that.

Note that the lexer is not automatically bound to an environment. Multiple environments can share the same lexer.

Lexer(environment: jinja2.environment.Environment) View Source

478    def __init__(self, environment: "Environment") -> None:
479        # shortcuts
480        e = re.escape
481
482        def c(x: str) -> t.Pattern[str]:
483            return re.compile(x, re.M | re.S)
484
485        # lexing rules for tags
486        tag_rules: t.List[_Rule] = [
487            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
488            _Rule(float_re, TOKEN_FLOAT, None),
489            _Rule(integer_re, TOKEN_INTEGER, None),
490            _Rule(name_re, TOKEN_NAME, None),
491            _Rule(string_re, TOKEN_STRING, None),
492            _Rule(operator_re, TOKEN_OPERATOR, None),
493        ]
494
495        # assemble the root lexing rule. because "|" takes the first match
496        # we have to sort by length so that the lexer continues working
497        # as expected when we have parsing rules like <% for block and
498        # <%= for variables. (if someone wants asp like syntax)
499        # variables are just part of the rules if variable processing
500        # is required.
501        root_tag_rules = compile_rules(environment)
502
503        block_start_re = e(environment.block_start_string)
504        block_end_re = e(environment.block_end_string)
505        comment_end_re = e(environment.comment_end_string)
506        variable_end_re = e(environment.variable_end_string)
507
508        # block suffix if trimming is enabled
509        block_suffix_re = "\\n?" if environment.trim_blocks else ""
510
511        self.lstrip_blocks = environment.lstrip_blocks
512
513        self.newline_sequence = environment.newline_sequence
514        self.keep_trailing_newline = environment.keep_trailing_newline
515
516        root_raw_re = (
517            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
518            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
519        )
520        root_parts_re = "|".join(
521            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
522        )
523
524        # global lexing rules
525        self.rules: t.Dict[str, t.List[_Rule]] = {
526            "root": [
527                # directives
528                _Rule(
529                    c(rf"(.*?)(?:{root_parts_re})"),
530                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
531                    "#bygroup",
532                ),
533                # data
534                _Rule(c(".+"), TOKEN_DATA, None),
535            ],
536            # comments
537            TOKEN_COMMENT_BEGIN: [
538                _Rule(
539                    c(
540                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
541                        rf"|{comment_end_re}{block_suffix_re}))"
542                    ),
543                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
544                    "#pop",
545                ),
546                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
547            ],
548            # blocks
549            TOKEN_BLOCK_BEGIN: [
550                _Rule(
551                    c(
552                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
553                        rf"|{block_end_re}{block_suffix_re})"
554                    ),
555                    TOKEN_BLOCK_END,
556                    "#pop",
557                ),
558            ]
559            + tag_rules,
560            # variables
561            TOKEN_VARIABLE_BEGIN: [
562                _Rule(
563                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
564                    TOKEN_VARIABLE_END,
565                    "#pop",
566                )
567            ]
568            + tag_rules,
569            # raw block
570            TOKEN_RAW_BEGIN: [
571                _Rule(
572                    c(
573                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
574                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
575                        rf"|{block_end_re}{block_suffix_re}))"
576                    ),
577                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
578                    "#pop",
579                ),
580                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
581            ],
582            # line statements
583            TOKEN_LINESTATEMENT_BEGIN: [
584                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
585            ]
586            + tag_rules,
587            # line comments
588            TOKEN_LINECOMMENT_BEGIN: [
589                _Rule(
590                    c(r"(.*?)()(?=\n|$)"),
591                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
592                    "#pop",
593                )
594            ],
595        }

lstrip_blocks

newline_sequence

keep_trailing_newline

rules: Dict[str, List[jinja2.lexer._Rule]]

def tokenize( self, source: str, name: Optional[str] = None, filename: Optional[str] = None, state: Optional[str] = None) -> TokenStream: View Source

603    def tokenize(
604        self,
605        source: str,
606        name: t.Optional[str] = None,
607        filename: t.Optional[str] = None,
608        state: t.Optional[str] = None,
609    ) -> TokenStream:
610        """Calls tokeniter + wrap and wraps the result in a token stream."""
611        stream = self.tokeniter(source, name, filename, state)
612        return TokenStream(self.wrap(stream, name, filename), name, filename)

Calls tokeniter + wrap and wraps the result in a token stream.

def wrap( self, stream: Iterable[Tuple[int, str, str]], name: Optional[str] = None, filename: Optional[str] = None) -> Iterator[Token]: View Source

614    def wrap(
615        self,
616        stream: t.Iterable[t.Tuple[int, str, str]],
617        name: t.Optional[str] = None,
618        filename: t.Optional[str] = None,
619    ) -> t.Iterator[Token]:
620        """This is called with the stream as returned by `tokenize` and wraps
621        every token in a :class:`Token` and converts the value.
622        """
623        for lineno, token, value_str in stream:
624            if token in ignored_tokens:
625                continue
626
627            value: t.Any = value_str
628
629            if token == TOKEN_LINESTATEMENT_BEGIN:
630                token = TOKEN_BLOCK_BEGIN
631            elif token == TOKEN_LINESTATEMENT_END:
632                token = TOKEN_BLOCK_END
633            # we are not interested in those tokens in the parser
634            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
635                continue
636            elif token == TOKEN_DATA:
637                value = self._normalize_newlines(value_str)
638            elif token == "keyword":
639                token = value_str
640            elif token == TOKEN_NAME:
641                value = value_str
642
643                if not value.isidentifier():
644                    raise TemplateSyntaxError(
645                        "Invalid character in identifier", lineno, name, filename
646                    )
647            elif token == TOKEN_STRING:
648                # try to unescape string
649                try:
650                    value = (
651                        self._normalize_newlines(value_str[1:-1])
652                        .encode("ascii", "backslashreplace")
653                        .decode("unicode-escape")
654                    )
655                except Exception as e:
656                    msg = str(e).split(":")[-1].strip()
657                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
658            elif token == TOKEN_INTEGER:
659                value = int(value_str.replace("_", ""), 0)
660            elif token == TOKEN_FLOAT:
661                # remove all "_" first to support more Python versions
662                value = literal_eval(value_str.replace("_", ""))
663            elif token == TOKEN_OPERATOR:
664                token = operators[value_str]
665
666            yield Token(lineno, token, value)

This is called with the stream as returned by tokeniter; it wraps every token in a Token and converts the value.

def tokeniter( self, source: str, name: Optional[str], filename: Optional[str] = None, state: Optional[str] = None) -> Iterator[Tuple[int, str, str]]: View Source

    def tokeniter(
        self,
        source: str,
        name: t.Optional[str],
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> t.Iterator[t.Tuple[int, str, str]]:
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.

        Line breaks in ``source`` are normalized to ``\\n`` before lexing.
        If the source ends with a line break, that final line break is
        dropped unless ``self.keep_trailing_newline`` is true.

        :param source: the template source to tokenize.
        :param name: template name, passed to the
            :exc:`TemplateSyntaxError` instances raised directly here.
        :param filename: template filename, used for error reporting.
        :param state: optional initial lexer state. ``None`` and ``"root"``
            start in the root state; ``"variable"`` and ``"block"`` start
            in the matching ``*_begin`` state. Any other value fails an
            assertion.
        :return: a generator of ``(lineno, token, value)`` tuples.
        :raises TemplateSyntaxError: on an unexpected or mismatched closing
            bracket, or when no rule matches before the end of the source.
        :raises RuntimeError: when a ``#bygroup`` rule matches without any
            named group, or a rule matches the empty string without
            changing the state stack.

        .. versionchanged:: 3.0
            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
            breaks.
        """
        # newline_re has a single capturing group, so split() alternates
        # between text and the matched line break; [::2] keeps the text.
        lines = newline_re.split(source)[::2]

        if not self.keep_trailing_newline and lines[-1] == "":
            del lines[-1]

        # from here on every line break in the source is a plain "\n"
        source = "\n".join(lines)
        pos = 0
        lineno = 1
        stack = ["root"]

        if state is not None and state != "root":
            assert state in ("variable", "block"), "invalid state"
            stack.append(state + "_begin")

        # rules of the state on top of the stack
        statetokens = self.rules[stack[-1]]
        source_length = len(source)
        # closing brackets still expected for the currently open ( [ {
        balancing_stack: t.List[str] = []
        # newlines removed by a "-" strip sign; added to lineno when the
        # next group is processed
        newlines_stripped = 0
        # whether the previous match ended with a newline (true at start)
        line_starting = True

        while True:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)

                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and tokens in (
                    TOKEN_VARIABLE_END,
                    TOKEN_BLOCK_END,
                    TOKEN_LINESTATEMENT_END,
                ):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    groups: t.Sequence[str] = m.groups()

                    if isinstance(tokens, OptionalLStrip):
                        # Rule supports lstrip. Match will look like
                        # text, block type, whitespace control, type, control, ...
                        text = groups[0]
                        # Skipping the text and first type, every other group is the
                        # whitespace control for each type. One of the groups will be
                        # -, +, or empty string instead of None.
                        strip_sign = next(g for g in groups[2::2] if g is not None)

                        if strip_sign == "-":
                            # Strip all whitespace between the text and the tag.
                            stripped = text.rstrip()
                            # Remember how many newlines were removed so that
                            # lineno stays in sync with the source.
                            newlines_stripped = text[len(stripped) :].count("\n")
                            groups = [stripped, *groups[1:]]
                        elif (
                            # Not marked for preserving whitespace.
                            strip_sign != "+"
                            # lstrip is enabled.
                            and self.lstrip_blocks
                            # Not a variable expression.
                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                        ):
                            # The start of text between the last newline and the tag.
                            l_pos = text.rfind("\n") + 1

                            if l_pos > 0 or line_starting:
                                # If there's only whitespace between the newline and the
                                # tag, strip it.
                                if whitespace_re.fullmatch(text, l_pos):
                                    groups = [text[:l_pos], *groups[1:]]

                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == "#bygroup":
                            for key, value in m.groupdict().items():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count("\n")
                                    break
                            else:
                                raise RuntimeError(
                                    f"{regex!r} wanted to resolve the token dynamically"
                                    " but no group matched"
                                )
                        # normal group
                        else:
                            data = groups[idx]

                            if data or token not in ignore_if_empty:
                                yield lineno, token, data

                            # account for newlines in the data and for those
                            # removed by a preceding "-" strip sign
                            lineno += data.count("\n") + newlines_stripped
                            newlines_stripped = 0

                # strings as token are just yielded as is.
                else:
                    data = m.group()

                    # update brace/parentheses balance
                    if tokens == TOKEN_OPERATOR:
                        if data == "{":
                            balancing_stack.append("}")
                        elif data == "(":
                            balancing_stack.append(")")
                        elif data == "[":
                            balancing_stack.append("]")
                        elif data in ("}", ")", "]"):
                            if not balancing_stack:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}'", lineno, name, filename
                                )

                            expected_op = balancing_stack.pop()

                            if expected_op != data:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}', expected '{expected_op}'",
                                    lineno,
                                    name,
                                    filename,
                                )

                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data

                    lineno += data.count("\n")

                # remember whether this match ended on a newline; the lstrip
                # logic above uses it for text that contains no newline
                line_starting = m.group()[-1:] == "\n"
                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == "#pop":
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == "#bygroup":
                        for key, value in m.groupdict().items():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError(
                                f"{regex!r} wanted to resolve the new state dynamically"
                                f" but no group matched"
                            )
                    # direct state name given
                    else:
                        stack.append(new_state)

                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError(
                        f"{regex!r} yielded empty string without stack change"
                    )

                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return

                # something went wrong
                raise TemplateSyntaxError(
                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
                )

This method tokenizes the text and returns the tokens in a generator. Use this method if you just want to tokenize a template.

Changed in version 3.0: Only \n, \r\n and \r are treated as line breaks.