jinja2.lexer

Implements a Jinja / Python combination lexer. The Lexer class is used to do some preprocessing. It filters out invalid operators like the bitshift operators we don't allow in templates. It separates template code and python code in expressions.
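
A minimal usage sketch (not part of the module source): templates normally reach this lexer through an Environment, whose lexer attribute is built by get_lexer() defined below. The template string and the printed tokens are illustrative.

    from jinja2 import Environment

    env = Environment()
    stream = env.lexer.tokenize("Hello {{ name }}!")

    for token in stream:
        # Each token carries its line number, type, and value; whitespace
        # and comment tokens are filtered out before they reach the stream.
        print(token.lineno, token.type, token.value)

    # Typical output:
    # 1 data Hello
    # 1 variable_begin {{
    # 1 name name
    # 1 variable_end }}
    # 1 data !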

  1"""Implements a Jinja / Python combination lexer. The ``Lexer`` class
  2is used to do some preprocessing. It filters out invalid operators like
  3the bitshift operators we don't allow in templates. It separates
  4template code and python code in expressions.
  5"""
  6
  7import re
  8import typing as t
  9from ast import literal_eval
 10from collections import deque
 11from sys import intern
 12
 13from ._identifier import pattern as name_re
 14from .exceptions import TemplateSyntaxError
 15from .utils import LRUCache
 16
 17if t.TYPE_CHECKING:
 18    import typing_extensions as te
 19
 20    from .environment import Environment
 21
 22# cache for the lexers. Exists in order to be able to have multiple
 23# environments with the same lexer
 24_lexer_cache: t.MutableMapping[t.Tuple, "Lexer"] = LRUCache(50)  # type: ignore
 25
 26# static regular expressions
 27whitespace_re = re.compile(r"\s+")
 28newline_re = re.compile(r"(\r\n|\r|\n)")
 29string_re = re.compile(
 30    r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S
 31)
 32integer_re = re.compile(
 33    r"""
 34    (
 35        0b(_?[0-1])+ # binary
 36    |
 37        0o(_?[0-7])+ # octal
 38    |
 39        0x(_?[\da-f])+ # hex
 40    |
 41        [1-9](_?\d)* # decimal
 42    |
 43        0(_?0)* # decimal zero
 44    )
 45    """,
 46    re.IGNORECASE | re.VERBOSE,
 47)
 48float_re = re.compile(
 49    r"""
 50    (?<!\.)  # doesn't start with a .
 51    (\d+_)*\d+  # digits, possibly _ separated
 52    (
 53        (\.(\d+_)*\d+)?  # optional fractional part
 54        e[+\-]?(\d+_)*\d+  # exponent part
 55    |
 56        \.(\d+_)*\d+  # required fractional part
 57    )
 58    """,
 59    re.IGNORECASE | re.VERBOSE,
 60)
 61
 62# intern the tokens and keep references to them
 63TOKEN_ADD = intern("add")
 64TOKEN_ASSIGN = intern("assign")
 65TOKEN_COLON = intern("colon")
 66TOKEN_COMMA = intern("comma")
 67TOKEN_DIV = intern("div")
 68TOKEN_DOT = intern("dot")
 69TOKEN_EQ = intern("eq")
 70TOKEN_FLOORDIV = intern("floordiv")
 71TOKEN_GT = intern("gt")
 72TOKEN_GTEQ = intern("gteq")
 73TOKEN_LBRACE = intern("lbrace")
 74TOKEN_LBRACKET = intern("lbracket")
 75TOKEN_LPAREN = intern("lparen")
 76TOKEN_LT = intern("lt")
 77TOKEN_LTEQ = intern("lteq")
 78TOKEN_MOD = intern("mod")
 79TOKEN_MUL = intern("mul")
 80TOKEN_NE = intern("ne")
 81TOKEN_PIPE = intern("pipe")
 82TOKEN_POW = intern("pow")
 83TOKEN_RBRACE = intern("rbrace")
 84TOKEN_RBRACKET = intern("rbracket")
 85TOKEN_RPAREN = intern("rparen")
 86TOKEN_SEMICOLON = intern("semicolon")
 87TOKEN_SUB = intern("sub")
 88TOKEN_TILDE = intern("tilde")
 89TOKEN_WHITESPACE = intern("whitespace")
 90TOKEN_FLOAT = intern("float")
 91TOKEN_INTEGER = intern("integer")
 92TOKEN_NAME = intern("name")
 93TOKEN_STRING = intern("string")
 94TOKEN_OPERATOR = intern("operator")
 95TOKEN_BLOCK_BEGIN = intern("block_begin")
 96TOKEN_BLOCK_END = intern("block_end")
 97TOKEN_VARIABLE_BEGIN = intern("variable_begin")
 98TOKEN_VARIABLE_END = intern("variable_end")
 99TOKEN_RAW_BEGIN = intern("raw_begin")
100TOKEN_RAW_END = intern("raw_end")
101TOKEN_COMMENT_BEGIN = intern("comment_begin")
102TOKEN_COMMENT_END = intern("comment_end")
103TOKEN_COMMENT = intern("comment")
104TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin")
105TOKEN_LINESTATEMENT_END = intern("linestatement_end")
106TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin")
107TOKEN_LINECOMMENT_END = intern("linecomment_end")
108TOKEN_LINECOMMENT = intern("linecomment")
109TOKEN_DATA = intern("data")
110TOKEN_INITIAL = intern("initial")
111TOKEN_EOF = intern("eof")
112
113# bind operators to token types
114operators = {
115    "+": TOKEN_ADD,
116    "-": TOKEN_SUB,
117    "/": TOKEN_DIV,
118    "//": TOKEN_FLOORDIV,
119    "*": TOKEN_MUL,
120    "%": TOKEN_MOD,
121    "**": TOKEN_POW,
122    "~": TOKEN_TILDE,
123    "[": TOKEN_LBRACKET,
124    "]": TOKEN_RBRACKET,
125    "(": TOKEN_LPAREN,
126    ")": TOKEN_RPAREN,
127    "{": TOKEN_LBRACE,
128    "}": TOKEN_RBRACE,
129    "==": TOKEN_EQ,
130    "!=": TOKEN_NE,
131    ">": TOKEN_GT,
132    ">=": TOKEN_GTEQ,
133    "<": TOKEN_LT,
134    "<=": TOKEN_LTEQ,
135    "=": TOKEN_ASSIGN,
136    ".": TOKEN_DOT,
137    ":": TOKEN_COLON,
138    "|": TOKEN_PIPE,
139    ",": TOKEN_COMMA,
140    ";": TOKEN_SEMICOLON,
141}
142
143reverse_operators = {v: k for k, v in operators.items()}
144assert len(operators) == len(reverse_operators), "operators dropped"
145operator_re = re.compile(
146    f"({'|'.join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))})"
147)
148
149ignored_tokens = frozenset(
150    [
151        TOKEN_COMMENT_BEGIN,
152        TOKEN_COMMENT,
153        TOKEN_COMMENT_END,
154        TOKEN_WHITESPACE,
155        TOKEN_LINECOMMENT_BEGIN,
156        TOKEN_LINECOMMENT_END,
157        TOKEN_LINECOMMENT,
158    ]
159)
160ignore_if_empty = frozenset(
161    [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT]
162)
163
164
165def _describe_token_type(token_type: str) -> str:
166    if token_type in reverse_operators:
167        return reverse_operators[token_type]
168
169    return {
170        TOKEN_COMMENT_BEGIN: "begin of comment",
171        TOKEN_COMMENT_END: "end of comment",
172        TOKEN_COMMENT: "comment",
173        TOKEN_LINECOMMENT: "comment",
174        TOKEN_BLOCK_BEGIN: "begin of statement block",
175        TOKEN_BLOCK_END: "end of statement block",
176        TOKEN_VARIABLE_BEGIN: "begin of print statement",
177        TOKEN_VARIABLE_END: "end of print statement",
178        TOKEN_LINESTATEMENT_BEGIN: "begin of line statement",
179        TOKEN_LINESTATEMENT_END: "end of line statement",
180        TOKEN_DATA: "template data / text",
181        TOKEN_EOF: "end of template",
182    }.get(token_type, token_type)
183
184
185def describe_token(token: "Token") -> str:
186    """Returns a description of the token."""
187    if token.type == TOKEN_NAME:
188        return token.value
189
190    return _describe_token_type(token.type)
191
192
193def describe_token_expr(expr: str) -> str:
194    """Like `describe_token` but for token expressions."""
195    if ":" in expr:
196        type, value = expr.split(":", 1)
197
198        if type == TOKEN_NAME:
199            return value
200    else:
201        type = expr
202
203    return _describe_token_type(type)
204
205
206def count_newlines(value: str) -> int:
207    """Count the number of newline characters in the string.  This is
208    useful for extensions that filter a stream.
209    """
210    return len(newline_re.findall(value))
211
212
213def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
214    """Compiles all the rules from the environment into a list of rules."""
215    e = re.escape
216    rules = [
217        (
218            len(environment.comment_start_string),
219            TOKEN_COMMENT_BEGIN,
220            e(environment.comment_start_string),
221        ),
222        (
223            len(environment.block_start_string),
224            TOKEN_BLOCK_BEGIN,
225            e(environment.block_start_string),
226        ),
227        (
228            len(environment.variable_start_string),
229            TOKEN_VARIABLE_BEGIN,
230            e(environment.variable_start_string),
231        ),
232    ]
233
234    if environment.line_statement_prefix is not None:
235        rules.append(
236            (
237                len(environment.line_statement_prefix),
238                TOKEN_LINESTATEMENT_BEGIN,
239                r"^[ \t\v]*" + e(environment.line_statement_prefix),
240            )
241        )
242    if environment.line_comment_prefix is not None:
243        rules.append(
244            (
245                len(environment.line_comment_prefix),
246                TOKEN_LINECOMMENT_BEGIN,
247                r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix),
248            )
249        )
250
251    return [x[1:] for x in sorted(rules, reverse=True)]
252
253
254class Failure:
255    """Class that raises a `TemplateSyntaxError` if called.
256    Used by the `Lexer` to specify known errors.
257    """
258
259    def __init__(
260        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
261    ) -> None:
262        self.message = message
263        self.error_class = cls
264
265    def __call__(self, lineno: int, filename: str) -> "te.NoReturn":
266        raise self.error_class(self.message, lineno, filename)
267
268
269class Token(t.NamedTuple):
270    lineno: int
271    type: str
272    value: str
273
274    def __str__(self) -> str:
275        return describe_token(self)
276
277    def test(self, expr: str) -> bool:
278        """Test a token against a token expression.  This can either be a
279        token type or ``'token_type:token_value'``.  This can only test
280        against string values and types.
281        """
282        # here we do a regular string equality check as test_any is usually
283        # passed an iterable of not interned strings.
284        if self.type == expr:
285            return True
286
287        if ":" in expr:
288            return expr.split(":", 1) == [self.type, self.value]
289
290        return False
291
292    def test_any(self, *iterable: str) -> bool:
293        """Test against multiple token expressions."""
294        return any(self.test(expr) for expr in iterable)
295
296
297class TokenStreamIterator:
298    """The iterator for tokenstreams.  Iterate over the stream
299    until the eof token is reached.
300    """
301
302    def __init__(self, stream: "TokenStream") -> None:
303        self.stream = stream
304
305    def __iter__(self) -> "TokenStreamIterator":
306        return self
307
308    def __next__(self) -> Token:
309        token = self.stream.current
310
311        if token.type is TOKEN_EOF:
312            self.stream.close()
313            raise StopIteration
314
315        next(self.stream)
316        return token
317
318
319class TokenStream:
320    """A token stream is an iterable that yields :class:`Token`\\s.  The
321    parser however does not iterate over it but calls :meth:`next` to go
322    one token ahead.  The current active token is stored as :attr:`current`.
323    """
324
325    def __init__(
326        self,
327        generator: t.Iterable[Token],
328        name: t.Optional[str],
329        filename: t.Optional[str],
330    ):
331        self._iter = iter(generator)
332        self._pushed: "te.Deque[Token]" = deque()
333        self.name = name
334        self.filename = filename
335        self.closed = False
336        self.current = Token(1, TOKEN_INITIAL, "")
337        next(self)
338
339    def __iter__(self) -> TokenStreamIterator:
340        return TokenStreamIterator(self)
341
342    def __bool__(self) -> bool:
343        return bool(self._pushed) or self.current.type is not TOKEN_EOF
344
345    @property
346    def eos(self) -> bool:
347        """Are we at the end of the stream?"""
348        return not self
349
350    def push(self, token: Token) -> None:
351        """Push a token back to the stream."""
352        self._pushed.append(token)
353
354    def look(self) -> Token:
355        """Look at the next token."""
356        old_token = next(self)
357        result = self.current
358        self.push(result)
359        self.current = old_token
360        return result
361
362    def skip(self, n: int = 1) -> None:
363        """Go n tokens ahead."""
364        for _ in range(n):
365            next(self)
366
367    def next_if(self, expr: str) -> t.Optional[Token]:
368        """Perform the token test and return the token if it matched.
369        Otherwise the return value is `None`.
370        """
371        if self.current.test(expr):
372            return next(self)
373
374        return None
375
376    def skip_if(self, expr: str) -> bool:
377        """Like :meth:`next_if` but only returns `True` or `False`."""
378        return self.next_if(expr) is not None
379
380    def __next__(self) -> Token:
381        """Go one token ahead and return the old one.
382
383        Use the built-in :func:`next` instead of calling this directly.
384        """
385        rv = self.current
386
387        if self._pushed:
388            self.current = self._pushed.popleft()
389        elif self.current.type is not TOKEN_EOF:
390            try:
391                self.current = next(self._iter)
392            except StopIteration:
393                self.close()
394
395        return rv
396
397    def close(self) -> None:
398        """Close the stream."""
399        self.current = Token(self.current.lineno, TOKEN_EOF, "")
400        self._iter = iter(())
401        self.closed = True
402
403    def expect(self, expr: str) -> Token:
404        """Expect a given token type and return it.  This accepts the same
405        argument as :meth:`jinja2.lexer.Token.test`.
406        """
407        if not self.current.test(expr):
408            expr = describe_token_expr(expr)
409
410            if self.current.type is TOKEN_EOF:
411                raise TemplateSyntaxError(
412                    f"unexpected end of template, expected {expr!r}.",
413                    self.current.lineno,
414                    self.name,
415                    self.filename,
416                )
417
418            raise TemplateSyntaxError(
419                f"expected token {expr!r}, got {describe_token(self.current)!r}",
420                self.current.lineno,
421                self.name,
422                self.filename,
423            )
424
425        return next(self)
426
427
428def get_lexer(environment: "Environment") -> "Lexer":
429    """Return a lexer which is probably cached."""
430    key = (
431        environment.block_start_string,
432        environment.block_end_string,
433        environment.variable_start_string,
434        environment.variable_end_string,
435        environment.comment_start_string,
436        environment.comment_end_string,
437        environment.line_statement_prefix,
438        environment.line_comment_prefix,
439        environment.trim_blocks,
440        environment.lstrip_blocks,
441        environment.newline_sequence,
442        environment.keep_trailing_newline,
443    )
444    lexer = _lexer_cache.get(key)
445
446    if lexer is None:
447        _lexer_cache[key] = lexer = Lexer(environment)
448
449    return lexer
450
451
452class OptionalLStrip(tuple):  # type: ignore[type-arg]
453    """A special tuple for marking a point in the state that can have
454    lstrip applied.
455    """
456
457    __slots__ = ()
458
459    # Even though it looks like a no-op, creating instances fails
460    # without this.
461    def __new__(cls, *members, **kwargs):  # type: ignore
462        return super().__new__(cls, members)
463
464
465class _Rule(t.NamedTuple):
466    pattern: t.Pattern[str]
467    tokens: t.Union[str, t.Tuple[str, ...], t.Tuple[Failure]]
468    command: t.Optional[str]
469
470
471class Lexer:
472    """Class that implements a lexer for a given environment. Automatically
473    created by the environment class, usually you don't have to do that.
474
475    Note that the lexer is not automatically bound to an environment.
476    Multiple environments can share the same lexer.
477    """
478
479    def __init__(self, environment: "Environment") -> None:
480        # shortcuts
481        e = re.escape
482
483        def c(x: str) -> t.Pattern[str]:
484            return re.compile(x, re.M | re.S)
485
486        # lexing rules for tags
487        tag_rules: t.List[_Rule] = [
488            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
489            _Rule(float_re, TOKEN_FLOAT, None),
490            _Rule(integer_re, TOKEN_INTEGER, None),
491            _Rule(name_re, TOKEN_NAME, None),
492            _Rule(string_re, TOKEN_STRING, None),
493            _Rule(operator_re, TOKEN_OPERATOR, None),
494        ]
495
496        # assemble the root lexing rule. because "|" is ungreedy
497        # we have to sort by length so that the lexer continues working
498        # as expected when we have parsing rules like <% for block and
499        # <%= for variables. (if someone wants asp like syntax)
500        # variables are just part of the rules if variable processing
501        # is required.
502        root_tag_rules = compile_rules(environment)
503
504        block_start_re = e(environment.block_start_string)
505        block_end_re = e(environment.block_end_string)
506        comment_end_re = e(environment.comment_end_string)
507        variable_end_re = e(environment.variable_end_string)
508
509        # block suffix if trimming is enabled
510        block_suffix_re = "\\n?" if environment.trim_blocks else ""
511
512        self.lstrip_blocks = environment.lstrip_blocks
513
514        self.newline_sequence = environment.newline_sequence
515        self.keep_trailing_newline = environment.keep_trailing_newline
516
517        root_raw_re = (
518            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
519            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
520        )
521        root_parts_re = "|".join(
522            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
523        )
524
525        # global lexing rules
526        self.rules: t.Dict[str, t.List[_Rule]] = {
527            "root": [
528                # directives
529                _Rule(
530                    c(rf"(.*?)(?:{root_parts_re})"),
531                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
532                    "#bygroup",
533                ),
534                # data
535                _Rule(c(".+"), TOKEN_DATA, None),
536            ],
537            # comments
538            TOKEN_COMMENT_BEGIN: [
539                _Rule(
540                    c(
541                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
542                        rf"|{comment_end_re}{block_suffix_re}))"
543                    ),
544                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
545                    "#pop",
546                ),
547                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
548            ],
549            # blocks
550            TOKEN_BLOCK_BEGIN: [
551                _Rule(
552                    c(
553                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
554                        rf"|{block_end_re}{block_suffix_re})"
555                    ),
556                    TOKEN_BLOCK_END,
557                    "#pop",
558                ),
559            ]
560            + tag_rules,
561            # variables
562            TOKEN_VARIABLE_BEGIN: [
563                _Rule(
564                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
565                    TOKEN_VARIABLE_END,
566                    "#pop",
567                )
568            ]
569            + tag_rules,
570            # raw block
571            TOKEN_RAW_BEGIN: [
572                _Rule(
573                    c(
574                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
575                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
576                        rf"|{block_end_re}{block_suffix_re}))"
577                    ),
578                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
579                    "#pop",
580                ),
581                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
582            ],
583            # line statements
584            TOKEN_LINESTATEMENT_BEGIN: [
585                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
586            ]
587            + tag_rules,
588            # line comments
589            TOKEN_LINECOMMENT_BEGIN: [
590                _Rule(
591                    c(r"(.*?)()(?=\n|$)"),
592                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
593                    "#pop",
594                )
595            ],
596        }
597
598    def _normalize_newlines(self, value: str) -> str:
599        """Replace all newlines with the configured sequence in strings
600        and template data.
601        """
602        return newline_re.sub(self.newline_sequence, value)
603
604    def tokenize(
605        self,
606        source: str,
607        name: t.Optional[str] = None,
608        filename: t.Optional[str] = None,
609        state: t.Optional[str] = None,
610    ) -> TokenStream:
611        """Calls tokeniter + wrap and wraps the result in a token stream."""
612        stream = self.tokeniter(source, name, filename, state)
613        return TokenStream(self.wrap(stream, name, filename), name, filename)
614
615    def wrap(
616        self,
617        stream: t.Iterable[t.Tuple[int, str, str]],
618        name: t.Optional[str] = None,
619        filename: t.Optional[str] = None,
620    ) -> t.Iterator[Token]:
621        """This is called with the stream as returned by `tokenize` and wraps
622        every token in a :class:`Token` and converts the value.
623        """
624        for lineno, token, value_str in stream:
625            if token in ignored_tokens:
626                continue
627
628            value: t.Any = value_str
629
630            if token == TOKEN_LINESTATEMENT_BEGIN:
631                token = TOKEN_BLOCK_BEGIN
632            elif token == TOKEN_LINESTATEMENT_END:
633                token = TOKEN_BLOCK_END
634            # we are not interested in those tokens in the parser
635            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
636                continue
637            elif token == TOKEN_DATA:
638                value = self._normalize_newlines(value_str)
639            elif token == "keyword":
640                token = value_str
641            elif token == TOKEN_NAME:
642                value = value_str
643
644                if not value.isidentifier():
645                    raise TemplateSyntaxError(
646                        "Invalid character in identifier", lineno, name, filename
647                    )
648            elif token == TOKEN_STRING:
649                # try to unescape string
650                try:
651                    value = (
652                        self._normalize_newlines(value_str[1:-1])
653                        .encode("ascii", "backslashreplace")
654                        .decode("unicode-escape")
655                    )
656                except Exception as e:
657                    msg = str(e).split(":")[-1].strip()
658                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
659            elif token == TOKEN_INTEGER:
660                value = int(value_str.replace("_", ""), 0)
661            elif token == TOKEN_FLOAT:
662                # remove all "_" first to support more Python versions
663                value = literal_eval(value_str.replace("_", ""))
664            elif token == TOKEN_OPERATOR:
665                token = operators[value_str]
666
667            yield Token(lineno, token, value)
668
669    def tokeniter(
670        self,
671        source: str,
672        name: t.Optional[str],
673        filename: t.Optional[str] = None,
674        state: t.Optional[str] = None,
675    ) -> t.Iterator[t.Tuple[int, str, str]]:
676        """This method tokenizes the text and returns the tokens in a
677        generator. Use this method if you just want to tokenize a template.
678
679        .. versionchanged:: 3.0
680            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
681            breaks.
682        """
683        lines = newline_re.split(source)[::2]
684
685        if not self.keep_trailing_newline and lines[-1] == "":
686            del lines[-1]
687
688        source = "\n".join(lines)
689        pos = 0
690        lineno = 1
691        stack = ["root"]
692
693        if state is not None and state != "root":
694            assert state in ("variable", "block"), "invalid state"
695            stack.append(state + "_begin")
696
697        statetokens = self.rules[stack[-1]]
698        source_length = len(source)
699        balancing_stack: t.List[str] = []
700        newlines_stripped = 0
701        line_starting = True
702
703        while True:
704            # tokenizer loop
705            for regex, tokens, new_state in statetokens:
706                m = regex.match(source, pos)
707
708                # if no match we try again with the next rule
709                if m is None:
710                    continue
711
712                # we only match blocks and variables if braces / parentheses
713                # are balanced. continue parsing with the lower rule which
714                # is the operator rule. do this only if the end tags look
715                # like operators
716                if balancing_stack and tokens in (
717                    TOKEN_VARIABLE_END,
718                    TOKEN_BLOCK_END,
719                    TOKEN_LINESTATEMENT_END,
720                ):
721                    continue
722
723                # tuples support more options
724                if isinstance(tokens, tuple):
725                    groups: t.Sequence[str] = m.groups()
726
727                    if isinstance(tokens, OptionalLStrip):
728                        # Rule supports lstrip. Match will look like
729                        # text, block type, whitespace control, type, control, ...
730                        text = groups[0]
731                        # Skipping the text and first type, every other group is the
732                        # whitespace control for each type. One of the groups will be
733                        # -, +, or empty string instead of None.
734                        strip_sign = next(g for g in groups[2::2] if g is not None)
735
736                        if strip_sign == "-":
737                            # Strip all whitespace between the text and the tag.
738                            stripped = text.rstrip()
739                            newlines_stripped = text[len(stripped) :].count("\n")
740                            groups = [stripped, *groups[1:]]
741                        elif (
742                            # Not marked for preserving whitespace.
743                            strip_sign != "+"
744                            # lstrip is enabled.
745                            and self.lstrip_blocks
746                            # Not a variable expression.
747                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
748                        ):
749                            # The start of text between the last newline and the tag.
750                            l_pos = text.rfind("\n") + 1
751
752                            if l_pos > 0 or line_starting:
753                                # If there's only whitespace between the newline and the
754                                # tag, strip it.
755                                if whitespace_re.fullmatch(text, l_pos):
756                                    groups = [text[:l_pos], *groups[1:]]
757
758                    for idx, token in enumerate(tokens):
759                        # failure group
760                        if token.__class__ is Failure:
761                            raise token(lineno, filename)
762                        # bygroup is a bit more complex, in that case we
763                        # yield for the current token the first named
764                        # group that matched
765                        elif token == "#bygroup":
766                            for key, value in m.groupdict().items():
767                                if value is not None:
768                                    yield lineno, key, value
769                                    lineno += value.count("\n")
770                                    break
771                            else:
772                                raise RuntimeError(
773                                    f"{regex!r} wanted to resolve the token dynamically"
774                                    " but no group matched"
775                                )
776                        # normal group
777                        else:
778                            data = groups[idx]
779
780                            if data or token not in ignore_if_empty:
781                                yield lineno, token, data
782
783                            lineno += data.count("\n") + newlines_stripped
784                            newlines_stripped = 0
785
786                # strings as tokens are just yielded as-is.
787                else:
788                    data = m.group()
789
790                    # update brace/parentheses balance
791                    if tokens == TOKEN_OPERATOR:
792                        if data == "{":
793                            balancing_stack.append("}")
794                        elif data == "(":
795                            balancing_stack.append(")")
796                        elif data == "[":
797                            balancing_stack.append("]")
798                        elif data in ("}", ")", "]"):
799                            if not balancing_stack:
800                                raise TemplateSyntaxError(
801                                    f"unexpected '{data}'", lineno, name, filename
802                                )
803
804                            expected_op = balancing_stack.pop()
805
806                            if expected_op != data:
807                                raise TemplateSyntaxError(
808                                    f"unexpected '{data}', expected '{expected_op}'",
809                                    lineno,
810                                    name,
811                                    filename,
812                                )
813
814                    # yield items
815                    if data or tokens not in ignore_if_empty:
816                        yield lineno, tokens, data
817
818                    lineno += data.count("\n")
819
820                line_starting = m.group()[-1:] == "\n"
821                # fetch new position into new variable so that we can check
822                # if there is an internal parsing error which would result
823                # in an infinite loop
824                pos2 = m.end()
825
826                # handle state changes
827                if new_state is not None:
828                    # remove the uppermost state
829                    if new_state == "#pop":
830                        stack.pop()
831                    # resolve the new state by group checking
832                    elif new_state == "#bygroup":
833                        for key, value in m.groupdict().items():
834                            if value is not None:
835                                stack.append(key)
836                                break
837                        else:
838                            raise RuntimeError(
839                                f"{regex!r} wanted to resolve the new state dynamically"
840                                f" but no group matched"
841                            )
842                    # direct state name given
843                    else:
844                        stack.append(new_state)
845
846                    statetokens = self.rules[stack[-1]]
847                # we are still at the same position and no stack change.
848                # this means a loop without break condition, avoid that and
849                # raise error
850                elif pos2 == pos:
851                    raise RuntimeError(
852                        f"{regex!r} yielded empty string without stack change"
853                    )
854
855                # publish the new position and start again
856                pos = pos2
857                break
858            # if loop terminated without break we haven't found a single match
859            # either we are at the end of the file or we have a problem
860            else:
861                # end of text
862                if pos >= source_length:
863                    return
864
865                # something went wrong
866                raise TemplateSyntaxError(
867                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
868                )
whitespace_re = re.compile('\\s+')
newline_re = re.compile('(\\r\\n|\\r|\\n)')
string_re = re.compile('(\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)")', re.DOTALL)
integer_re = re.compile('\n    (\n        0b(_?[0-1])+ # binary\n    |\n        0o(_?[0-7])+ # octal\n    |\n        0x(_?[\\da-f])+ # hex\n    |\n        [1-9](_?\\d)* # decimal\n    |\n        0(_?0)* # decimal zero\n    )\n    ', re.IGNORECASE|re.VERBOSE)
float_re = re.compile("\n    (?<!\\.)  # doesn't start with a .\n    (\\d+_)*\\d+  # digits, possibly _ separated\n    (\n        (\\.(\\d+_)*\\d+)?  # optional fractional part\n        e[+\\-]?(\\d+_)*\\d+  # exponent part\n    |\n        \\.(\\d+_)*\\d+  # required fractional part\n    )\n    ", re.IGNORECASE|re.VERBOSE)
TOKEN_ADD = 'add'
TOKEN_ASSIGN = 'assign'
TOKEN_COLON = 'colon'
TOKEN_COMMA = 'comma'
TOKEN_DIV = 'div'
TOKEN_DOT = 'dot'
TOKEN_EQ = 'eq'
TOKEN_FLOORDIV = 'floordiv'
TOKEN_GT = 'gt'
TOKEN_GTEQ = 'gteq'
TOKEN_LBRACE = 'lbrace'
TOKEN_LBRACKET = 'lbracket'
TOKEN_LPAREN = 'lparen'
TOKEN_LT = 'lt'
TOKEN_LTEQ = 'lteq'
TOKEN_MOD = 'mod'
TOKEN_MUL = 'mul'
TOKEN_NE = 'ne'
TOKEN_PIPE = 'pipe'
TOKEN_POW = 'pow'
TOKEN_RBRACE = 'rbrace'
TOKEN_RBRACKET = 'rbracket'
TOKEN_RPAREN = 'rparen'
TOKEN_SEMICOLON = 'semicolon'
TOKEN_SUB = 'sub'
TOKEN_TILDE = 'tilde'
TOKEN_WHITESPACE = 'whitespace'
TOKEN_FLOAT = 'float'
TOKEN_INTEGER = 'integer'
TOKEN_NAME = 'name'
TOKEN_STRING = 'string'
TOKEN_OPERATOR = 'operator'
TOKEN_BLOCK_BEGIN = 'block_begin'
TOKEN_BLOCK_END = 'block_end'
TOKEN_VARIABLE_BEGIN = 'variable_begin'
TOKEN_VARIABLE_END = 'variable_end'
TOKEN_RAW_BEGIN = 'raw_begin'
TOKEN_RAW_END = 'raw_end'
TOKEN_COMMENT_BEGIN = 'comment_begin'
TOKEN_COMMENT_END = 'comment_end'
TOKEN_COMMENT = 'comment'
TOKEN_LINESTATEMENT_BEGIN = 'linestatement_begin'
TOKEN_LINESTATEMENT_END = 'linestatement_end'
TOKEN_LINECOMMENT_BEGIN = 'linecomment_begin'
TOKEN_LINECOMMENT_END = 'linecomment_end'
TOKEN_LINECOMMENT = 'linecomment'
TOKEN_DATA = 'data'
TOKEN_INITIAL = 'initial'
TOKEN_EOF = 'eof'
operators = {'+': 'add', '-': 'sub', '/': 'div', '//': 'floordiv', '*': 'mul', '%': 'mod', '**': 'pow', '~': 'tilde', '[': 'lbracket', ']': 'rbracket', '(': 'lparen', ')': 'rparen', '{': 'lbrace', '}': 'rbrace', '==': 'eq', '!=': 'ne', '>': 'gt', '>=': 'gteq', '<': 'lt', '<=': 'lteq', '=': 'assign', '.': 'dot', ':': 'colon', '|': 'pipe', ',': 'comma', ';': 'semicolon'}
reverse_operators = {'add': '+', 'sub': '-', 'div': '/', 'floordiv': '//', 'mul': '*', 'mod': '%', 'pow': '**', 'tilde': '~', 'lbracket': '[', 'rbracket': ']', 'lparen': '(', 'rparen': ')', 'lbrace': '{', 'rbrace': '}', 'eq': '==', 'ne': '!=', 'gt': '>', 'gteq': '>=', 'lt': '<', 'lteq': '<=', 'assign': '=', 'dot': '.', 'colon': ':', 'pipe': '|', 'comma': ',', 'semicolon': ';'}
operator_re = re.compile('(//|\\*\\*|==|!=|>=|<=|\\+|\\-|/|\\*|%|\\~|\\[|\\]|\\(|\\)|\\{|\\}|>|<|=|\\.|:|\\||,|;)')
ignored_tokens = frozenset({'comment', 'linecomment', 'linecomment_begin', 'whitespace', 'linecomment_end', 'comment_begin', 'comment_end'})
ignore_if_empty = frozenset({'data', 'linecomment', 'comment', 'whitespace'})
def describe_token(token: Token) -> str:
186def describe_token(token: "Token") -> str:
187    """Returns a description of the token."""
188    if token.type == TOKEN_NAME:
189        return token.value
190
191    return _describe_token_type(token.type)

Returns a description of the token.
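
A small sketch of what describe_token returns, using hand-built Token instances (the values are illustrative): name tokens are described by their value, operator tokens by their symbol.

    from jinja2.lexer import TOKEN_ASSIGN, TOKEN_NAME, Token, describe_token

    print(describe_token(Token(1, TOKEN_NAME, "user")))  # user
    print(describe_token(Token(1, TOKEN_ASSIGN, "=")))   # =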

def describe_token_expr(expr: str) -> str:
194def describe_token_expr(expr: str) -> str:
195    """Like `describe_token` but for token expressions."""
196    if ":" in expr:
197        type, value = expr.split(":", 1)
198
199        if type == TOKEN_NAME:
200            return value
201    else:
202        type = expr
203
204    return _describe_token_type(type)

Like describe_token but for token expressions.
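
A short sketch of the two expression forms it accepts, a bare token type and a "type:value" pair (the examples are illustrative):

    from jinja2.lexer import describe_token_expr

    print(describe_token_expr("block_end"))    # end of statement block
    print(describe_token_expr("name:endfor"))  # endfor
    print(describe_token_expr("assign"))       # =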

def count_newlines(value: str) -> int:
207def count_newlines(value: str) -> int:
208    """Count the number of newline characters in the string.  This is
209    useful for extensions that filter a stream.
210    """
211    return len(newline_re.findall(value))

Count the number of newline characters in the string. This is useful for extensions that filter a stream.
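
For example, each of \n, \r\n and \r counts as a single line break, per newline_re above:

    from jinja2.lexer import count_newlines

    # Three line breaks: "\n", "\r\n" and "\r".
    assert count_newlines("a\nb\r\nc\rd") == 3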

def compile_rules(environment: jinja2.environment.Environment) -> List[Tuple[str, str]]:
214def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
215    """Compiles all the rules from the environment into a list of rules."""
216    e = re.escape
217    rules = [
218        (
219            len(environment.comment_start_string),
220            TOKEN_COMMENT_BEGIN,
221            e(environment.comment_start_string),
222        ),
223        (
224            len(environment.block_start_string),
225            TOKEN_BLOCK_BEGIN,
226            e(environment.block_start_string),
227        ),
228        (
229            len(environment.variable_start_string),
230            TOKEN_VARIABLE_BEGIN,
231            e(environment.variable_start_string),
232        ),
233    ]
234
235    if environment.line_statement_prefix is not None:
236        rules.append(
237            (
238                len(environment.line_statement_prefix),
239                TOKEN_LINESTATEMENT_BEGIN,
240                r"^[ \t\v]*" + e(environment.line_statement_prefix),
241            )
242        )
243    if environment.line_comment_prefix is not None:
244        rules.append(
245            (
246                len(environment.line_comment_prefix),
247                TOKEN_LINECOMMENT_BEGIN,
248                r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix),
249            )
250        )
251
252    return [x[1:] for x in sorted(rules, reverse=True)]

Compiles all the rules from the environment into a list of rules.
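
A sketch with a default Environment (the commented output is approximate and depends on the configured delimiters and on re.escape): the rules come back as (token, pattern) pairs, longest delimiter first, ready to be embedded in the root regex.

    from jinja2 import Environment
    from jinja2.lexer import compile_rules

    for token, pattern in compile_rules(Environment()):
        print(token, pattern)

    # Roughly, with the default "{{ ... }}", "{% ... %}", "{# ... #}" delimiters:
    # variable_begin \{\{
    # comment_begin \{\#
    # block_begin \{%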

class Failure:
255class Failure:
256    """Class that raises a `TemplateSyntaxError` if called.
257    Used by the `Lexer` to specify known errors.
258    """
259
260    def __init__(
261        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
262    ) -> None:
263        self.message = message
264        self.error_class = cls
265
266    def __call__(self, lineno: int, filename: str) -> "te.NoReturn":
267        raise self.error_class(self.message, lineno, filename)

Class that raises a TemplateSyntaxError if called. Used by the Lexer to specify known errors.
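
A minimal sketch of the behaviour (the message and filename are illustrative): calling the instance raises its error class with the stored message.

    from jinja2.exceptions import TemplateSyntaxError
    from jinja2.lexer import Failure

    fail = Failure("Missing end of comment tag")
    try:
        fail(3, "example.html")
    except TemplateSyntaxError as exc:
        print(exc.lineno, exc.message)  # 3 Missing end of comment tag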

Failure( message: str, cls: Type[jinja2.exceptions.TemplateSyntaxError] = <class 'jinja2.exceptions.TemplateSyntaxError'>)
260    def __init__(
261        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
262    ) -> None:
263        self.message = message
264        self.error_class = cls
message
error_class
class Token(typing.NamedTuple):
270class Token(t.NamedTuple):
271    lineno: int
272    type: str
273    value: str
274
275    def __str__(self) -> str:
276        return describe_token(self)
277
278    def test(self, expr: str) -> bool:
279        """Test a token against a token expression.  This can either be a
280        token type or ``'token_type:token_value'``.  This can only test
281        against string values and types.
282        """
283        # here we do a regular string equality check as test_any is usually
284        # passed an iterable of not interned strings.
285        if self.type == expr:
286            return True
287
288        if ":" in expr:
289            return expr.split(":", 1) == [self.type, self.value]
290
291        return False
292
293    def test_any(self, *iterable: str) -> bool:
294        """Test against multiple token expressions."""
295        return any(self.test(expr) for expr in iterable)

Token(lineno, type, value)

Token(lineno: int, type: str, value: str)

Create new instance of Token(lineno, type, value)

lineno: int

Alias for field number 0

type: str

Alias for field number 1

value: str

Alias for field number 2

def test(self, expr: str) -> bool:
278    def test(self, expr: str) -> bool:
279        """Test a token against a token expression.  This can either be a
280        token type or ``'token_type:token_value'``.  This can only test
281        against string values and types.
282        """
283        # here we do a regular string equality check as test_any is usually
284        # passed an iterable of not interned strings.
285        if self.type == expr:
286            return True
287
288        if ":" in expr:
289            return expr.split(":", 1) == [self.type, self.value]
290
291        return False

Test a token against a token expression. This can either be a token type or 'token_type:token_value'. This can only test against string values and types.
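
A small sketch with a hand-built token (the values are illustrative):

    from jinja2.lexer import TOKEN_NAME, Token

    tok = Token(1, TOKEN_NAME, "endfor")
    print(tok.test("name"))         # True  (type only)
    print(tok.test("name:endfor"))  # True  (type and value)
    print(tok.test("name:endif"))   # False (value differs)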

def test_any(self, *iterable: str) -> bool:
293    def test_any(self, *iterable: str) -> bool:
294        """Test against multiple token expressions."""
295        return any(self.test(expr) for expr in iterable)

Test against multiple token expressions.
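
This is how the parser asks "is the current token one of these keywords?"; a sketch with an illustrative token:

    from jinja2.lexer import TOKEN_NAME, Token

    tok = Token(1, TOKEN_NAME, "elif")
    print(tok.test_any("name:elif", "name:else", "name:endif"))  # True
    print(tok.test_any("integer", "float"))                      # False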

Inherited Members
builtins.tuple
index
count
class TokenStreamIterator:
298class TokenStreamIterator:
299    """The iterator for tokenstreams.  Iterate over the stream
300    until the eof token is reached.
301    """
302
303    def __init__(self, stream: "TokenStream") -> None:
304        self.stream = stream
305
306    def __iter__(self) -> "TokenStreamIterator":
307        return self
308
309    def __next__(self) -> Token:
310        token = self.stream.current
311
312        if token.type is TOKEN_EOF:
313            self.stream.close()
314            raise StopIteration
315
316        next(self.stream)
317        return token

The iterator for tokenstreams. Iterate over the stream until the eof token is reached.

TokenStreamIterator(stream: TokenStream)
303    def __init__(self, stream: "TokenStream") -> None:
304        self.stream = stream
stream
class TokenStream:
320class TokenStream:
321    """A token stream is an iterable that yields :class:`Token`\\s.  The
322    parser however does not iterate over it but calls :meth:`next` to go
323    one token ahead.  The current active token is stored as :attr:`current`.
324    """
325
326    def __init__(
327        self,
328        generator: t.Iterable[Token],
329        name: t.Optional[str],
330        filename: t.Optional[str],
331    ):
332        self._iter = iter(generator)
333        self._pushed: "te.Deque[Token]" = deque()
334        self.name = name
335        self.filename = filename
336        self.closed = False
337        self.current = Token(1, TOKEN_INITIAL, "")
338        next(self)
339
340    def __iter__(self) -> TokenStreamIterator:
341        return TokenStreamIterator(self)
342
343    def __bool__(self) -> bool:
344        return bool(self._pushed) or self.current.type is not TOKEN_EOF
345
346    @property
347    def eos(self) -> bool:
348        """Are we at the end of the stream?"""
349        return not self
350
351    def push(self, token: Token) -> None:
352        """Push a token back to the stream."""
353        self._pushed.append(token)
354
355    def look(self) -> Token:
356        """Look at the next token."""
357        old_token = next(self)
358        result = self.current
359        self.push(result)
360        self.current = old_token
361        return result
362
363    def skip(self, n: int = 1) -> None:
364        """Go n tokens ahead."""
365        for _ in range(n):
366            next(self)
367
368    def next_if(self, expr: str) -> t.Optional[Token]:
369        """Perform the token test and return the token if it matched.
370        Otherwise the return value is `None`.
371        """
372        if self.current.test(expr):
373            return next(self)
374
375        return None
376
377    def skip_if(self, expr: str) -> bool:
378        """Like :meth:`next_if` but only returns `True` or `False`."""
379        return self.next_if(expr) is not None
380
381    def __next__(self) -> Token:
382        """Go one token ahead and return the old one.
383
384        Use the built-in :func:`next` instead of calling this directly.
385        """
386        rv = self.current
387
388        if self._pushed:
389            self.current = self._pushed.popleft()
390        elif self.current.type is not TOKEN_EOF:
391            try:
392                self.current = next(self._iter)
393            except StopIteration:
394                self.close()
395
396        return rv
397
398    def close(self) -> None:
399        """Close the stream."""
400        self.current = Token(self.current.lineno, TOKEN_EOF, "")
401        self._iter = iter(())
402        self.closed = True
403
404    def expect(self, expr: str) -> Token:
405        """Expect a given token type and return it.  This accepts the same
406        argument as :meth:`jinja2.lexer.Token.test`.
407        """
408        if not self.current.test(expr):
409            expr = describe_token_expr(expr)
410
411            if self.current.type is TOKEN_EOF:
412                raise TemplateSyntaxError(
413                    f"unexpected end of template, expected {expr!r}.",
414                    self.current.lineno,
415                    self.name,
416                    self.filename,
417                )
418
419            raise TemplateSyntaxError(
420                f"expected token {expr!r}, got {describe_token(self.current)!r}",
421                self.current.lineno,
422                self.name,
423                self.filename,
424            )
425
426        return next(self)

A token stream is an iterable that yields Tokens. The parser, however, does not iterate over it but calls next() to go one token ahead. The current active token is stored as current.
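
A sketch of driving a stream by hand the way the parser does (the template text is illustrative); expect, look and skip_if are defined below.

    from jinja2 import Environment

    stream = Environment().lexer.tokenize("{{ user.name }}")

    stream.expect("variable_begin")     # consume the opening "{{"
    first = stream.expect("name")       # consume "user"
    print(first.value)                  # user
    print(stream.look().type)           # peek: name (the "name" after the dot)
    stream.skip_if("dot")               # consume the "." and return True
    print(stream.expect("name").value)  # name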

TokenStream( generator: Iterable[Token], name: Optional[str], filename: Optional[str])
326    def __init__(
327        self,
328        generator: t.Iterable[Token],
329        name: t.Optional[str],
330        filename: t.Optional[str],
331    ):
332        self._iter = iter(generator)
333        self._pushed: "te.Deque[Token]" = deque()
334        self.name = name
335        self.filename = filename
336        self.closed = False
337        self.current = Token(1, TOKEN_INITIAL, "")
338        next(self)
name
filename
closed
current
eos: bool
346    @property
347    def eos(self) -> bool:
348        """Are we at the end of the stream?"""
349        return not self

Are we at the end of the stream?

def push(self, token: Token) -> None:
351    def push(self, token: Token) -> None:
352        """Push a token back to the stream."""
353        self._pushed.append(token)

Push a token back to the stream.

def look(self) -> Token:
355    def look(self) -> Token:
356        """Look at the next token."""
357        old_token = next(self)
358        result = self.current
359        self.push(result)
360        self.current = old_token
361        return result

Look at the next token.

def skip(self, n: int = 1) -> None:
363    def skip(self, n: int = 1) -> None:
364        """Go n tokens ahead."""
365        for _ in range(n):
366            next(self)

Go n tokens ahead.

def next_if(self, expr: str) -> Optional[Token]:
368    def next_if(self, expr: str) -> t.Optional[Token]:
369        """Perform the token test and return the token if it matched.
370        Otherwise the return value is `None`.
371        """
372        if self.current.test(expr):
373            return next(self)
374
375        return None

Perform the token test and return the token if it matched. Otherwise the return value is None.
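
A short sketch (illustrative template text): the token is consumed only when the test matches.

    from jinja2 import Environment

    stream = Environment().lexer.tokenize("{{ x }}")
    stream.skip()                      # step past variable_begin
    tok = stream.next_if("name:x")     # matches, so the token is consumed
    print(tok.value if tok else None)  # x
    print(stream.next_if("name"))      # None; current is now variable_end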

def skip_if(self, expr: str) -> bool:
377    def skip_if(self, expr: str) -> bool:
378        """Like :meth:`next_if` but only returns `True` or `False`."""
379        return self.next_if(expr) is not None

Like next_if() but only returns True or False.

def close(self) -> None:
398    def close(self) -> None:
399        """Close the stream."""
400        self.current = Token(self.current.lineno, TOKEN_EOF, "")
401        self._iter = iter(())
402        self.closed = True

Close the stream.

def expect(self, expr: str) -> Token:
404    def expect(self, expr: str) -> Token:
405        """Expect a given token type and return it.  This accepts the same
406        argument as :meth:`jinja2.lexer.Token.test`.
407        """
408        if not self.current.test(expr):
409            expr = describe_token_expr(expr)
410
411            if self.current.type is TOKEN_EOF:
412                raise TemplateSyntaxError(
413                    f"unexpected end of template, expected {expr!r}.",
414                    self.current.lineno,
415                    self.name,
416                    self.filename,
417                )
418
419            raise TemplateSyntaxError(
420                f"expected token {expr!r}, got {describe_token(self.current)!r}",
421                self.current.lineno,
422                self.name,
423                self.filename,
424            )
425
426        return next(self)

Expect a given token type and return it. This accepts the same argument as Token.test().
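
A sketch of the failure case (illustrative template text): a non-matching current token produces a readable TemplateSyntaxError.

    from jinja2 import Environment
    from jinja2.exceptions import TemplateSyntaxError

    stream = Environment().lexer.tokenize("{{ 42 }}")
    stream.expect("variable_begin")
    try:
        stream.expect("name")  # the current token is the integer literal
    except TemplateSyntaxError as exc:
        print(exc.message)     # expected token 'name', got 'integer'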

def get_lexer(environment: jinja2.environment.Environment) -> Lexer:
429def get_lexer(environment: "Environment") -> "Lexer":
430    """Return a lexer which is probably cached."""
431    key = (
432        environment.block_start_string,
433        environment.block_end_string,
434        environment.variable_start_string,
435        environment.variable_end_string,
436        environment.comment_start_string,
437        environment.comment_end_string,
438        environment.line_statement_prefix,
439        environment.line_comment_prefix,
440        environment.trim_blocks,
441        environment.lstrip_blocks,
442        environment.newline_sequence,
443        environment.keep_trailing_newline,
444    )
445    lexer = _lexer_cache.get(key)
446
447    if lexer is None:
448        _lexer_cache[key] = lexer = Lexer(environment)
449
450    return lexer

Return a lexer which is probably cached.
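
A sketch of the caching behaviour: environments whose lexer-relevant settings match (the key tuple above) share one Lexer instance.

    from jinja2 import Environment
    from jinja2.lexer import get_lexer

    a = get_lexer(Environment())
    b = get_lexer(Environment())
    c = get_lexer(Environment(block_start_string="<%"))

    print(a is b)  # True: identical settings hit the same cache entry
    print(a is c)  # False: different delimiters build a new lexer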

class OptionalLStrip(builtins.tuple):
453class OptionalLStrip(tuple):  # type: ignore[type-arg]
454    """A special tuple for marking a point in the state that can have
455    lstrip applied.
456    """
457
458    __slots__ = ()
459
460    # Even though it looks like a no-op, creating instances fails
461    # without this.
462    def __new__(cls, *members, **kwargs):  # type: ignore
463        return super().__new__(cls, members)

A special tuple for marking a point in the state that can have lstrip applied.

Inherited Members
builtins.tuple
index
count
class Lexer:
472class Lexer:
473    """Class that implements a lexer for a given environment. Automatically
474    created by the environment class, usually you don't have to do that.
475
476    Note that the lexer is not automatically bound to an environment.
477    Multiple environments can share the same lexer.
478    """
479
480    def __init__(self, environment: "Environment") -> None:
481        # shortcuts
482        e = re.escape
483
484        def c(x: str) -> t.Pattern[str]:
485            return re.compile(x, re.M | re.S)
486
487        # lexing rules for tags
488        tag_rules: t.List[_Rule] = [
489            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
490            _Rule(float_re, TOKEN_FLOAT, None),
491            _Rule(integer_re, TOKEN_INTEGER, None),
492            _Rule(name_re, TOKEN_NAME, None),
493            _Rule(string_re, TOKEN_STRING, None),
494            _Rule(operator_re, TOKEN_OPERATOR, None),
495        ]
496
497        # assemble the root lexing rule. because "|" is ungreedy
498        # we have to sort by length so that the lexer continues working
499        # as expected when we have parsing rules like <% for block and
500        # <%= for variables. (if someone wants asp like syntax)
501        # variables are just part of the rules if variable processing
502        # is required.
503        root_tag_rules = compile_rules(environment)
504
505        block_start_re = e(environment.block_start_string)
506        block_end_re = e(environment.block_end_string)
507        comment_end_re = e(environment.comment_end_string)
508        variable_end_re = e(environment.variable_end_string)
509
510        # block suffix if trimming is enabled
511        block_suffix_re = "\\n?" if environment.trim_blocks else ""
512
513        self.lstrip_blocks = environment.lstrip_blocks
514
515        self.newline_sequence = environment.newline_sequence
516        self.keep_trailing_newline = environment.keep_trailing_newline
517
518        root_raw_re = (
519            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
520            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
521        )
522        root_parts_re = "|".join(
523            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
524        )
525
526        # global lexing rules
527        self.rules: t.Dict[str, t.List[_Rule]] = {
528            "root": [
529                # directives
530                _Rule(
531                    c(rf"(.*?)(?:{root_parts_re})"),
532                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
533                    "#bygroup",
534                ),
535                # data
536                _Rule(c(".+"), TOKEN_DATA, None),
537            ],
538            # comments
539            TOKEN_COMMENT_BEGIN: [
540                _Rule(
541                    c(
542                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
543                        rf"|{comment_end_re}{block_suffix_re}))"
544                    ),
545                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
546                    "#pop",
547                ),
548                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
549            ],
550            # blocks
551            TOKEN_BLOCK_BEGIN: [
552                _Rule(
553                    c(
554                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
555                        rf"|{block_end_re}{block_suffix_re})"
556                    ),
557                    TOKEN_BLOCK_END,
558                    "#pop",
559                ),
560            ]
561            + tag_rules,
562            # variables
563            TOKEN_VARIABLE_BEGIN: [
564                _Rule(
565                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
566                    TOKEN_VARIABLE_END,
567                    "#pop",
568                )
569            ]
570            + tag_rules,
571            # raw block
572            TOKEN_RAW_BEGIN: [
573                _Rule(
574                    c(
575                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
576                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
577                        rf"|{block_end_re}{block_suffix_re}))"
578                    ),
579                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
580                    "#pop",
581                ),
582                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
583            ],
584            # line statements
585            TOKEN_LINESTATEMENT_BEGIN: [
586                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
587            ]
588            + tag_rules,
589            # line comments
590            TOKEN_LINECOMMENT_BEGIN: [
591                _Rule(
592                    c(r"(.*?)()(?=\n|$)"),
593                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
594                    "#pop",
595                )
596            ],
597        }
598
599    def _normalize_newlines(self, value: str) -> str:
600        """Replace all newlines with the configured sequence in strings
601        and template data.
602        """
603        return newline_re.sub(self.newline_sequence, value)
604
605    def tokenize(
606        self,
607        source: str,
608        name: t.Optional[str] = None,
609        filename: t.Optional[str] = None,
610        state: t.Optional[str] = None,
611    ) -> TokenStream:
612        """Calls tokeniter + wrap and wraps the result in a token stream."""
613        stream = self.tokeniter(source, name, filename, state)
614        return TokenStream(self.wrap(stream, name, filename), name, filename)
615
616    def wrap(
617        self,
618        stream: t.Iterable[t.Tuple[int, str, str]],
619        name: t.Optional[str] = None,
620        filename: t.Optional[str] = None,
621    ) -> t.Iterator[Token]:
622        """This is called with the stream as returned by `tokenize` and wraps
623        every token in a :class:`Token` and converts the value.
624        """
625        for lineno, token, value_str in stream:
626            if token in ignored_tokens:
627                continue
628
629            value: t.Any = value_str
630
631            if token == TOKEN_LINESTATEMENT_BEGIN:
632                token = TOKEN_BLOCK_BEGIN
633            elif token == TOKEN_LINESTATEMENT_END:
634                token = TOKEN_BLOCK_END
635            # we are not interested in those tokens in the parser
636            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
637                continue
638            elif token == TOKEN_DATA:
639                value = self._normalize_newlines(value_str)
640            elif token == "keyword":
641                token = value_str
642            elif token == TOKEN_NAME:
643                value = value_str
644
645                if not value.isidentifier():
646                    raise TemplateSyntaxError(
647                        "Invalid character in identifier", lineno, name, filename
648                    )
649            elif token == TOKEN_STRING:
650                # try to unescape string
651                try:
652                    value = (
653                        self._normalize_newlines(value_str[1:-1])
654                        .encode("ascii", "backslashreplace")
655                        .decode("unicode-escape")
656                    )
657                except Exception as e:
658                    msg = str(e).split(":")[-1].strip()
659                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
660            elif token == TOKEN_INTEGER:
661                value = int(value_str.replace("_", ""), 0)
662            elif token == TOKEN_FLOAT:
663                # remove all "_" first to support more Python versions
664                value = literal_eval(value_str.replace("_", ""))
665            elif token == TOKEN_OPERATOR:
666                token = operators[value_str]
667
668            yield Token(lineno, token, value)
669
670    def tokeniter(
671        self,
672        source: str,
673        name: t.Optional[str],
674        filename: t.Optional[str] = None,
675        state: t.Optional[str] = None,
676    ) -> t.Iterator[t.Tuple[int, str, str]]:
677        """This method tokenizes the text and returns the tokens in a
678        generator. Use this method if you just want to tokenize a template.
679
680        .. versionchanged:: 3.0
681            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
682            breaks.
683        """
684        lines = newline_re.split(source)[::2]
685
686        if not self.keep_trailing_newline and lines[-1] == "":
687            del lines[-1]
688
689        source = "\n".join(lines)
690        pos = 0
691        lineno = 1
692        stack = ["root"]
693
694        if state is not None and state != "root":
695            assert state in ("variable", "block"), "invalid state"
696            stack.append(state + "_begin")
697
698        statetokens = self.rules[stack[-1]]
699        source_length = len(source)
700        balancing_stack: t.List[str] = []
701        newlines_stripped = 0
702        line_starting = True
703
704        while True:
705            # tokenizer loop
706            for regex, tokens, new_state in statetokens:
707                m = regex.match(source, pos)
708
709                # if no match we try again with the next rule
710                if m is None:
711                    continue
712
713                # we only match blocks and variables if braces / parentheses
714                # are balanced. continue parsing with the lower rule which
715                # is the operator rule. do this only if the end tags look
716                # like operators
717                if balancing_stack and tokens in (
718                    TOKEN_VARIABLE_END,
719                    TOKEN_BLOCK_END,
720                    TOKEN_LINESTATEMENT_END,
721                ):
722                    continue
723
724                # tuples support more options
725                if isinstance(tokens, tuple):
726                    groups: t.Sequence[str] = m.groups()
727
728                    if isinstance(tokens, OptionalLStrip):
729                        # Rule supports lstrip. Match will look like
730                        # text, block type, whitespace control, type, control, ...
731                        text = groups[0]
732                        # Skipping the text and first type, every other group is the
733                        # whitespace control for each type. One of the groups will be
734                        # -, +, or empty string instead of None.
735                        strip_sign = next(g for g in groups[2::2] if g is not None)
736
737                        if strip_sign == "-":
738                            # Strip all whitespace between the text and the tag.
739                            stripped = text.rstrip()
740                            newlines_stripped = text[len(stripped) :].count("\n")
741                            groups = [stripped, *groups[1:]]
742                        elif (
743                            # Not marked for preserving whitespace.
744                            strip_sign != "+"
745                            # lstrip is enabled.
746                            and self.lstrip_blocks
747                            # Not a variable expression.
748                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
749                        ):
750                            # The start of text between the last newline and the tag.
751                            l_pos = text.rfind("\n") + 1
752
753                            if l_pos > 0 or line_starting:
754                                # If there's only whitespace between the newline and the
755                                # tag, strip it.
756                                if whitespace_re.fullmatch(text, l_pos):
757                                    groups = [text[:l_pos], *groups[1:]]
758
759                    for idx, token in enumerate(tokens):
760                        # failure group
761                        if token.__class__ is Failure:
762                            raise token(lineno, filename)
763                        # bygroup is a bit more complex, in that case we
764                        # yield for the current token the first named
765                        # group that matched
766                        elif token == "#bygroup":
767                            for key, value in m.groupdict().items():
768                                if value is not None:
769                                    yield lineno, key, value
770                                    lineno += value.count("\n")
771                                    break
772                            else:
773                                raise RuntimeError(
774                                    f"{regex!r} wanted to resolve the token dynamically"
775                                    " but no group matched"
776                                )
777                        # normal group
778                        else:
779                            data = groups[idx]
780
781                            if data or token not in ignore_if_empty:
782                                yield lineno, token, data
783
784                            lineno += data.count("\n") + newlines_stripped
785                            newlines_stripped = 0
786
787                # if tokens is a plain string, the matched text is yielded as-is.
788                else:
789                    data = m.group()
790
791                    # update brace/parentheses balance
792                    if tokens == TOKEN_OPERATOR:
793                        if data == "{":
794                            balancing_stack.append("}")
795                        elif data == "(":
796                            balancing_stack.append(")")
797                        elif data == "[":
798                            balancing_stack.append("]")
799                        elif data in ("}", ")", "]"):
800                            if not balancing_stack:
801                                raise TemplateSyntaxError(
802                                    f"unexpected '{data}'", lineno, name, filename
803                                )
804
805                            expected_op = balancing_stack.pop()
806
807                            if expected_op != data:
808                                raise TemplateSyntaxError(
809                                    f"unexpected '{data}', expected '{expected_op}'",
810                                    lineno,
811                                    name,
812                                    filename,
813                                )
814
815                    # yield items
816                    if data or tokens not in ignore_if_empty:
817                        yield lineno, tokens, data
818
819                    lineno += data.count("\n")
820
821                line_starting = m.group()[-1:] == "\n"
822                # fetch the new position into a new variable so that we can check
823                # if there is an internal parsing error which would result
824                # in an infinite loop
825                pos2 = m.end()
826
827                # handle state changes
828                if new_state is not None:
829                    # remove the uppermost state
830                    if new_state == "#pop":
831                        stack.pop()
832                    # resolve the new state by group checking
833                    elif new_state == "#bygroup":
834                        for key, value in m.groupdict().items():
835                            if value is not None:
836                                stack.append(key)
837                                break
838                        else:
839                            raise RuntimeError(
840                                f"{regex!r} wanted to resolve the new state dynamically"
841                                f" but no group matched"
842                            )
843                    # direct state name given
844                    else:
845                        stack.append(new_state)
846
847                    statetokens = self.rules[stack[-1]]
848                # we are still at the same position and no stack change.
849                # this means a loop without break condition, avoid that and
850                # raise error
851                elif pos2 == pos:
852                    raise RuntimeError(
853                        f"{regex!r} yielded empty string without stack change"
854                    )
855
856                # publish the new position and start again
857                pos = pos2
858                break
859            # if loop terminated without break we haven't found a single match
860            # either we are at the end of the file or we have a problem
861            else:
862                # end of text
863                if pos >= source_length:
864                    return
865
866                # something went wrong
867                raise TemplateSyntaxError(
868                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
869                )

Class that implements a lexer for a given environment. It is created automatically by the environment class, so you usually don't need to instantiate it yourself.

Note that the lexer is not automatically bound to an environment. Multiple environments can share the same lexer.
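As a rough sketch of how this plays out in practice, assuming only the public Environment API (the lexer is obtained through the environment's lexer property rather than constructed by hand):

    from jinja2 import Environment

    # The environment builds a lexer matching its delimiter and whitespace
    # settings; you normally never call Lexer(...) yourself.
    env = Environment()
    lexer = env.lexer

    # Environments with the same lexer-relevant settings reuse the same
    # cached Lexer instance, which is why it is not bound to any of them.
    other = Environment()
    assert other.lexer is lexer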

Lexer(environment: jinja2.environment.Environment)
480    def __init__(self, environment: "Environment") -> None:
481        # shortcuts
482        e = re.escape
483
484        def c(x: str) -> t.Pattern[str]:
485            return re.compile(x, re.M | re.S)
486
487        # lexing rules for tags
488        tag_rules: t.List[_Rule] = [
489            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
490            _Rule(float_re, TOKEN_FLOAT, None),
491            _Rule(integer_re, TOKEN_INTEGER, None),
492            _Rule(name_re, TOKEN_NAME, None),
493            _Rule(string_re, TOKEN_STRING, None),
494            _Rule(operator_re, TOKEN_OPERATOR, None),
495        ]
496
497        # assemble the root lexing rule. because "|" is ungreedy
498        # we have to sort by length so that the lexer continues working
499        # as expected when we have parsing rules like <% for block and
500        # <%= for variables. (if someone wants asp like syntax)
501        # variables are just part of the rules if variable processing
502        # is required.
503        root_tag_rules = compile_rules(environment)
504
505        block_start_re = e(environment.block_start_string)
506        block_end_re = e(environment.block_end_string)
507        comment_end_re = e(environment.comment_end_string)
508        variable_end_re = e(environment.variable_end_string)
509
510        # block suffix if trimming is enabled
511        block_suffix_re = "\\n?" if environment.trim_blocks else ""
512
513        self.lstrip_blocks = environment.lstrip_blocks
514
515        self.newline_sequence = environment.newline_sequence
516        self.keep_trailing_newline = environment.keep_trailing_newline
517
518        root_raw_re = (
519            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
520            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
521        )
522        root_parts_re = "|".join(
523            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
524        )
525
526        # global lexing rules
527        self.rules: t.Dict[str, t.List[_Rule]] = {
528            "root": [
529                # directives
530                _Rule(
531                    c(rf"(.*?)(?:{root_parts_re})"),
532                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
533                    "#bygroup",
534                ),
535                # data
536                _Rule(c(".+"), TOKEN_DATA, None),
537            ],
538            # comments
539            TOKEN_COMMENT_BEGIN: [
540                _Rule(
541                    c(
542                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
543                        rf"|{comment_end_re}{block_suffix_re}))"
544                    ),
545                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
546                    "#pop",
547                ),
548                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
549            ],
550            # blocks
551            TOKEN_BLOCK_BEGIN: [
552                _Rule(
553                    c(
554                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
555                        rf"|{block_end_re}{block_suffix_re})"
556                    ),
557                    TOKEN_BLOCK_END,
558                    "#pop",
559                ),
560            ]
561            + tag_rules,
562            # variables
563            TOKEN_VARIABLE_BEGIN: [
564                _Rule(
565                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
566                    TOKEN_VARIABLE_END,
567                    "#pop",
568                )
569            ]
570            + tag_rules,
571            # raw block
572            TOKEN_RAW_BEGIN: [
573                _Rule(
574                    c(
575                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
576                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
577                        rf"|{block_end_re}{block_suffix_re}))"
578                    ),
579                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
580                    "#pop",
581                ),
582                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
583            ],
584            # line statements
585            TOKEN_LINESTATEMENT_BEGIN: [
586                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
587            ]
588            + tag_rules,
589            # line comments
590            TOKEN_LINECOMMENT_BEGIN: [
591                _Rule(
592                    c(r"(.*?)()(?=\n|$)"),
593                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
594                    "#pop",
595                )
596            ],
597        }
lstrip_blocks
newline_sequence
keep_trailing_newline
rules: Dict[str, List[jinja2.lexer._Rule]]
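These attributes mirror the corresponding Environment options captured in the constructor above, and the rules dict holds one rule list per lexer state. A small sketch (the option values here are illustrative):

    from jinja2 import Environment

    env = Environment(
        lstrip_blocks=True,
        trim_blocks=True,
        newline_sequence="\r\n",
        keep_trailing_newline=True,
    )
    lexer = env.lexer

    print(lexer.lstrip_blocks)                # True
    print(repr(lexer.newline_sequence))       # '\r\n'
    print(lexer.keep_trailing_newline)        # True
    print(sorted(lexer.rules))
    # ['block_begin', 'comment_begin', 'linecomment_begin',
    #  'linestatement_begin', 'raw_begin', 'root', 'variable_begin']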
def tokenize( self, source: str, name: Optional[str] = None, filename: Optional[str] = None, state: Optional[str] = None) -> TokenStream:
605    def tokenize(
606        self,
607        source: str,
608        name: t.Optional[str] = None,
609        filename: t.Optional[str] = None,
610        state: t.Optional[str] = None,
611    ) -> TokenStream:
612        """Calls tokeniter, wraps each token via wrap, and returns a token stream."""
613        stream = self.tokeniter(source, name, filename, state)
614        return TokenStream(self.wrap(stream, name, filename), name, filename)

Calls tokeniter, wraps each token via wrap, and returns a token stream.
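For example, a minimal sketch (the template text and name are purely illustrative):

    from jinja2 import Environment

    env = Environment()
    stream = env.lexer.tokenize("Hello {{ name }}!", name="greeting")

    # The TokenStream is iterable; each Token carries lineno, type and value.
    for token in stream:
        print(token.lineno, token.type, repr(token.value))
    # Roughly:
    # 1 data 'Hello '
    # 1 variable_begin '{{'
    # 1 name 'name'
    # 1 variable_end '}}'
    # 1 data '!'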

def wrap( self, stream: Iterable[Tuple[int, str, str]], name: Optional[str] = None, filename: Optional[str] = None) -> Iterator[Token]:
616    def wrap(
617        self,
618        stream: t.Iterable[t.Tuple[int, str, str]],
619        name: t.Optional[str] = None,
620        filename: t.Optional[str] = None,
621    ) -> t.Iterator[Token]:
622        """This is called with the stream as returned by `tokeniter` and wraps
623        every token in a :class:`Token` and converts the value.
624        """
625        for lineno, token, value_str in stream:
626            if token in ignored_tokens:
627                continue
628
629            value: t.Any = value_str
630
631            if token == TOKEN_LINESTATEMENT_BEGIN:
632                token = TOKEN_BLOCK_BEGIN
633            elif token == TOKEN_LINESTATEMENT_END:
634                token = TOKEN_BLOCK_END
635            # we are not interested in those tokens in the parser
636            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
637                continue
638            elif token == TOKEN_DATA:
639                value = self._normalize_newlines(value_str)
640            elif token == "keyword":
641                token = value_str
642            elif token == TOKEN_NAME:
643                value = value_str
644
645                if not value.isidentifier():
646                    raise TemplateSyntaxError(
647                        "Invalid character in identifier", lineno, name, filename
648                    )
649            elif token == TOKEN_STRING:
650                # try to unescape string
651                try:
652                    value = (
653                        self._normalize_newlines(value_str[1:-1])
654                        .encode("ascii", "backslashreplace")
655                        .decode("unicode-escape")
656                    )
657                except Exception as e:
658                    msg = str(e).split(":")[-1].strip()
659                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
660            elif token == TOKEN_INTEGER:
661                value = int(value_str.replace("_", ""), 0)
662            elif token == TOKEN_FLOAT:
663                # remove all "_" first to support more Python versions
664                value = literal_eval(value_str.replace("_", ""))
665            elif token == TOKEN_OPERATOR:
666                token = operators[value_str]
667
668            yield Token(lineno, token, value)

This is called with the stream as returned by tokeniter and wraps every token in a Token and converts the value.
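In other words, tokeniter produces raw (lineno, token, value) tuples and wrap turns them into Token objects with converted values, which is what tokenize chains together. A sketch (template text illustrative, output approximate):

    from jinja2 import Environment

    env = Environment()
    raw = env.lexer.tokeniter('{{ 1_000 + "hi" }}', name=None)

    for token in env.lexer.wrap(raw):
        print(token.type, repr(token.value))
    # Roughly:
    # variable_begin '{{'
    # integer 1000        (underscore removed, converted to int)
    # add '+'             (operator mapped to its token type)
    # string 'hi'         (quotes stripped, escapes resolved)
    # variable_end '}}'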

def tokeniter( self, source: str, name: Optional[str], filename: Optional[str] = None, state: Optional[str] = None) -> Iterator[Tuple[int, str, str]]:
670    def tokeniter(
671        self,
672        source: str,
673        name: t.Optional[str],
674        filename: t.Optional[str] = None,
675        state: t.Optional[str] = None,
676    ) -> t.Iterator[t.Tuple[int, str, str]]:
677        """This method tokenizes the text and returns the tokens in a
678        generator. Use this method if you just want to tokenize a template.
679
680        .. versionchanged:: 3.0
681            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
682            breaks.
683        """
684        lines = newline_re.split(source)[::2]
685
686        if not self.keep_trailing_newline and lines[-1] == "":
687            del lines[-1]
688
689        source = "\n".join(lines)
690        pos = 0
691        lineno = 1
692        stack = ["root"]
693
694        if state is not None and state != "root":
695            assert state in ("variable", "block"), "invalid state"
696            stack.append(state + "_begin")
697
698        statetokens = self.rules[stack[-1]]
699        source_length = len(source)
700        balancing_stack: t.List[str] = []
701        newlines_stripped = 0
702        line_starting = True
703
704        while True:
705            # tokenizer loop
706            for regex, tokens, new_state in statetokens:
707                m = regex.match(source, pos)
708
709                # if no match we try again with the next rule
710                if m is None:
711                    continue
712
713                # we only match blocks and variables if braces / parentheses
714                # are balanced. continue parsing with the lower rule which
715                # is the operator rule. do this only if the end tags look
716                # like operators
717                if balancing_stack and tokens in (
718                    TOKEN_VARIABLE_END,
719                    TOKEN_BLOCK_END,
720                    TOKEN_LINESTATEMENT_END,
721                ):
722                    continue
723
724                # tuples support more options
725                if isinstance(tokens, tuple):
726                    groups: t.Sequence[str] = m.groups()
727
728                    if isinstance(tokens, OptionalLStrip):
729                        # Rule supports lstrip. Match will look like
730                        # text, block type, whitespace control, type, control, ...
731                        text = groups[0]
732                        # Skipping the text and first type, every other group is the
733                        # whitespace control for each type. One of the groups will be
734                        # -, +, or empty string instead of None.
735                        strip_sign = next(g for g in groups[2::2] if g is not None)
736
737                        if strip_sign == "-":
738                            # Strip all whitespace between the text and the tag.
739                            stripped = text.rstrip()
740                            newlines_stripped = text[len(stripped) :].count("\n")
741                            groups = [stripped, *groups[1:]]
742                        elif (
743                            # Not marked for preserving whitespace.
744                            strip_sign != "+"
745                            # lstrip is enabled.
746                            and self.lstrip_blocks
747                            # Not a variable expression.
748                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
749                        ):
750                            # The start of text between the last newline and the tag.
751                            l_pos = text.rfind("\n") + 1
752
753                            if l_pos > 0 or line_starting:
754                                # If there's only whitespace between the newline and the
755                                # tag, strip it.
756                                if whitespace_re.fullmatch(text, l_pos):
757                                    groups = [text[:l_pos], *groups[1:]]
758
759                    for idx, token in enumerate(tokens):
760                        # failure group
761                        if token.__class__ is Failure:
762                            raise token(lineno, filename)
763                        # bygroup is a bit more complex, in that case we
764                        # yield for the current token the first named
765                        # group that matched
766                        elif token == "#bygroup":
767                            for key, value in m.groupdict().items():
768                                if value is not None:
769                                    yield lineno, key, value
770                                    lineno += value.count("\n")
771                                    break
772                            else:
773                                raise RuntimeError(
774                                    f"{regex!r} wanted to resolve the token dynamically"
775                                    " but no group matched"
776                                )
777                        # normal group
778                        else:
779                            data = groups[idx]
780
781                            if data or token not in ignore_if_empty:
782                                yield lineno, token, data
783
784                            lineno += data.count("\n") + newlines_stripped
785                            newlines_stripped = 0
786
787                # if tokens is a plain string, the matched text is yielded as-is.
788                else:
789                    data = m.group()
790
791                    # update brace/parentheses balance
792                    if tokens == TOKEN_OPERATOR:
793                        if data == "{":
794                            balancing_stack.append("}")
795                        elif data == "(":
796                            balancing_stack.append(")")
797                        elif data == "[":
798                            balancing_stack.append("]")
799                        elif data in ("}", ")", "]"):
800                            if not balancing_stack:
801                                raise TemplateSyntaxError(
802                                    f"unexpected '{data}'", lineno, name, filename
803                                )
804
805                            expected_op = balancing_stack.pop()
806
807                            if expected_op != data:
808                                raise TemplateSyntaxError(
809                                    f"unexpected '{data}', expected '{expected_op}'",
810                                    lineno,
811                                    name,
812                                    filename,
813                                )
814
815                    # yield items
816                    if data or tokens not in ignore_if_empty:
817                        yield lineno, tokens, data
818
819                    lineno += data.count("\n")
820
821                line_starting = m.group()[-1:] == "\n"
822                # fetch the new position into a new variable so that we can check
823                # if there is an internal parsing error which would result
824                # in an infinite loop
825                pos2 = m.end()
826
827                # handle state changes
828                if new_state is not None:
829                    # remove the uppermost state
830                    if new_state == "#pop":
831                        stack.pop()
832                    # resolve the new state by group checking
833                    elif new_state == "#bygroup":
834                        for key, value in m.groupdict().items():
835                            if value is not None:
836                                stack.append(key)
837                                break
838                        else:
839                            raise RuntimeError(
840                                f"{regex!r} wanted to resolve the new state dynamically"
841                                f" but no group matched"
842                            )
843                    # direct state name given
844                    else:
845                        stack.append(new_state)
846
847                    statetokens = self.rules[stack[-1]]
848                # we are still at the same position and no stack change.
849                # this means a loop without break condition, avoid that and
850                # raise error
851                elif pos2 == pos:
852                    raise RuntimeError(
853                        f"{regex!r} yielded empty string without stack change"
854                    )
855
856                # publish the new position and start again
857                pos = pos2
858                break
859            # if loop terminated without break we haven't found a single match
860            # either we are at the end of the file or we have a problem
861            else:
862                # end of text
863                if pos >= source_length:
864                    return
865
866                # something went wrong
867                raise TemplateSyntaxError(
868                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
869                )

This method tokenizes the text and returns the tokens in a generator. Use this method if you just want to tokenize a template.

Changed in version 3.0: Only \n, \r\n and \r are treated as line breaks.
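A low-level sketch of both entry points (the output shown in comments is approximate; tokeniter yields raw tuples, including whitespace tokens that wrap would later filter out):

    from jinja2 import Environment

    env = Environment()

    # Raw (lineno, token, value) tuples, before any wrapping or conversion.
    for tok in env.lexer.tokeniter("{% if user %}hi{% endif %}", name=None):
        print(tok)
    # e.g. (1, 'block_begin', '{%'), (1, 'whitespace', ' '),
    #      (1, 'name', 'if'), (1, 'whitespace', ' '), (1, 'name', 'user'), ...

    # The state argument accepts "variable" or "block" (see the assertion in
    # the source above) to start lexing as if inside the corresponding tag,
    # e.g. for a bare expression with no {{ ... }} delimiters.
    for item in env.lexer.tokeniter("user.name | upper", name=None, state="variable"):
        print(item)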