jinja2.lexer
Implements a Jinja / Python combination lexer. The Lexer class
is used to do some preprocessing. It filters out invalid operators like
the bitshift operators we don't allow in templates. It separates
template code and python code in expressions.
"""Implements a Jinja / Python combination lexer. The ``Lexer`` class
is used to do some preprocessing. It filters out invalid operators like
the bitshift operators we don't allow in templates. It separates
template code and python code in expressions.
"""
import re
import typing as t
from ast import literal_eval
from collections import deque
from sys import intern

from ._identifier import pattern as name_re
from .exceptions import TemplateSyntaxError
from .utils import LRUCache

if t.TYPE_CHECKING:
    import typing_extensions as te
    from .environment import Environment

# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache: t.MutableMapping[t.Tuple, "Lexer"] = LRUCache(50)  # type: ignore

# static regular expressions
whitespace_re = re.compile(r"\s+")
newline_re = re.compile(r"(\r\n|\r|\n)")
string_re = re.compile(
    r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S
)
integer_re = re.compile(
    r"""
    (
        0b(_?[0-1])+ # binary
    |
        0o(_?[0-7])+ # octal
    |
        0x(_?[\da-f])+ # hex
    |
        [1-9](_?\d)* # decimal
    |
        0(_?0)* # decimal zero
    )
    """,
    re.IGNORECASE | re.VERBOSE,
)
float_re = re.compile(
    r"""
    (?<!\.)  # doesn't start with a .
    (\d+_)*\d+  # digits, possibly _ separated
    (
        (\.(\d+_)*\d+)?  # optional fractional part
        e[+\-]?(\d+_)*\d+  # exponent part
    |
        \.(\d+_)*\d+  # required fractional part
    )
    """,
    re.IGNORECASE | re.VERBOSE,
)

# intern the tokens and keep references to them
TOKEN_ADD = intern("add")
TOKEN_ASSIGN = intern("assign")
TOKEN_COLON = intern("colon")
TOKEN_COMMA = intern("comma")
TOKEN_DIV = intern("div")
TOKEN_DOT = intern("dot")
TOKEN_EQ = intern("eq")
TOKEN_FLOORDIV = intern("floordiv")
TOKEN_GT = intern("gt")
TOKEN_GTEQ = intern("gteq")
TOKEN_LBRACE = intern("lbrace")
TOKEN_LBRACKET = intern("lbracket")
TOKEN_LPAREN = intern("lparen")
TOKEN_LT = intern("lt")
TOKEN_LTEQ = intern("lteq")
TOKEN_MOD = intern("mod")
TOKEN_MUL = intern("mul")
TOKEN_NE = intern("ne")
TOKEN_PIPE = intern("pipe")
TOKEN_POW = intern("pow")
TOKEN_RBRACE = intern("rbrace")
TOKEN_RBRACKET = intern("rbracket")
TOKEN_RPAREN = intern("rparen")
TOKEN_SEMICOLON = intern("semicolon")
TOKEN_SUB = intern("sub")
TOKEN_TILDE = intern("tilde")
TOKEN_WHITESPACE = intern("whitespace")
TOKEN_FLOAT = intern("float")
TOKEN_INTEGER = intern("integer")
TOKEN_NAME = intern("name")
TOKEN_STRING = intern("string")
TOKEN_OPERATOR = intern("operator")
TOKEN_BLOCK_BEGIN = intern("block_begin")
TOKEN_BLOCK_END = intern("block_end")
TOKEN_VARIABLE_BEGIN = intern("variable_begin")
TOKEN_VARIABLE_END = intern("variable_end")
TOKEN_RAW_BEGIN = intern("raw_begin")
TOKEN_RAW_END = intern("raw_end")
TOKEN_COMMENT_BEGIN = intern("comment_begin")
TOKEN_COMMENT_END = intern("comment_end")
TOKEN_COMMENT = intern("comment")
TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin")
TOKEN_LINESTATEMENT_END = intern("linestatement_end")
TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin")
TOKEN_LINECOMMENT_END = intern("linecomment_end")
TOKEN_LINECOMMENT = intern("linecomment")
TOKEN_DATA = intern("data")
TOKEN_INITIAL = intern("initial")
TOKEN_EOF = intern("eof")

# bind operators to token types
operators = {
    "+": TOKEN_ADD,
    "-": TOKEN_SUB,
    "/": TOKEN_DIV,
    "//": TOKEN_FLOORDIV,
    "*": TOKEN_MUL,
    "%": TOKEN_MOD,
    "**": TOKEN_POW,
    "~": TOKEN_TILDE,
    "[": TOKEN_LBRACKET,
    "]": TOKEN_RBRACKET,
    "(": TOKEN_LPAREN,
    ")": TOKEN_RPAREN,
    "{": TOKEN_LBRACE,
    "}": TOKEN_RBRACE,
    "==": TOKEN_EQ,
    "!=": TOKEN_NE,
    ">": TOKEN_GT,
    ">=": TOKEN_GTEQ,
    "<": TOKEN_LT,
    "<=": TOKEN_LTEQ,
    "=": TOKEN_ASSIGN,
    ".": TOKEN_DOT,
    ":": TOKEN_COLON,
    "|": TOKEN_PIPE,
    ",": TOKEN_COMMA,
    ";": TOKEN_SEMICOLON,
}

reverse_operators = {v: k for k, v in operators.items()}
assert len(operators) == len(reverse_operators), "operators dropped"
# longest operators first so that e.g. "**" is tried before "*"
operator_re = re.compile(
    f"({'|'.join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))})"
)

# token types that `Lexer.wrap` drops before they reach the token stream
ignored_tokens = frozenset(
    [
        TOKEN_COMMENT_BEGIN,
        TOKEN_COMMENT,
        TOKEN_COMMENT_END,
        TOKEN_WHITESPACE,
        TOKEN_LINECOMMENT_BEGIN,
        TOKEN_LINECOMMENT_END,
        TOKEN_LINECOMMENT,
    ]
)
# token types that `Lexer.tokeniter` does not yield when their text is empty
ignore_if_empty = frozenset(
    [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT]
)


def _describe_token_type(token_type: str) -> str:
    """Return a readable description for a token type.

    Operator token types map back to their symbol, a handful of
    structural token types get a fixed phrase, and anything else is
    returned unchanged.
    """
    if token_type in reverse_operators:
        return reverse_operators[token_type]

    return {
        TOKEN_COMMENT_BEGIN: "begin of comment",
        TOKEN_COMMENT_END: "end of comment",
        TOKEN_COMMENT: "comment",
        TOKEN_LINECOMMENT: "comment",
        TOKEN_BLOCK_BEGIN: "begin of statement block",
        TOKEN_BLOCK_END: "end of statement block",
        TOKEN_VARIABLE_BEGIN: "begin of print statement",
        TOKEN_VARIABLE_END: "end of print statement",
        TOKEN_LINESTATEMENT_BEGIN: "begin of line statement",
        TOKEN_LINESTATEMENT_END: "end of line statement",
        TOKEN_DATA: "template data / text",
        TOKEN_EOF: "end of template",
    }.get(token_type, token_type)


def describe_token(token: "Token") -> str:
    """Returns a description of the token."""
    if token.type == TOKEN_NAME:
        return token.value

    return _describe_token_type(token.type)


def describe_token_expr(expr: str) -> str:
    """Like `describe_token` but for token expressions."""
    if ":" in expr:
        type, value = expr.split(":", 1)

        if type == TOKEN_NAME:
            return value
    else:
        type = expr

    return _describe_token_type(type)


def count_newlines(value: str) -> int:
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))


def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (
            len(environment.comment_start_string),
            TOKEN_COMMENT_BEGIN,
            e(environment.comment_start_string),
        ),
        (
            len(environment.block_start_string),
            TOKEN_BLOCK_BEGIN,
            e(environment.block_start_string),
        ),
        (
            len(environment.variable_start_string),
            TOKEN_VARIABLE_BEGIN,
            e(environment.variable_start_string),
        ),
    ]

    if environment.line_statement_prefix is not None:
        rules.append(
            (
                len(environment.line_statement_prefix),
                TOKEN_LINESTATEMENT_BEGIN,
                r"^[ \t\v]*" + e(environment.line_statement_prefix),
            )
        )
    if environment.line_comment_prefix is not None:
        rules.append(
            (
                len(environment.line_comment_prefix),
                TOKEN_LINECOMMENT_BEGIN,
                r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix),
            )
        )

    # longest delimiter first; the length is only a sort key and is dropped
    return [x[1:] for x in sorted(rules, reverse=True)]


class Failure:
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(
        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
    ) -> None:
        self.message = message
        self.error_class = cls

    def __call__(self, lineno: int, filename: str) -> "te.NoReturn":
        # NOTE(review): every other raise in this module passes
        # (message, lineno, name, filename); here ``filename`` lands in the
        # third positional slot. Confirm against the error class signature.
        raise self.error_class(self.message, lineno, filename)


class Token(t.NamedTuple):
    lineno: int
    type: str
    value: str

    def __str__(self) -> str:
        return describe_token(self)

    def test(self, expr: str) -> bool:
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True

        if ":" in expr:
            return expr.split(":", 1) == [self.type, self.value]

        return False

    def test_any(self, *iterable: str) -> bool:
        """Test against multiple token expressions."""
        return any(self.test(expr) for expr in iterable)


class TokenStreamIterator:
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream: "TokenStream") -> None:
        self.stream = stream

    def __iter__(self) -> "TokenStreamIterator":
        return self

    def __next__(self) -> Token:
        token = self.stream.current

        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration

        next(self.stream)
        return token


class TokenStream:
    """A token stream is an iterable that yields :class:`Token`\\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(
        self,
        generator: t.Iterable[Token],
        name: t.Optional[str],
        filename: t.Optional[str],
    ):
        self._iter = iter(generator)
        self._pushed: "te.Deque[Token]" = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, "")
        # replace the placeholder with the first real token
        next(self)

    def __iter__(self) -> TokenStreamIterator:
        return TokenStreamIterator(self)

    def __bool__(self) -> bool:
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    @property
    def eos(self) -> bool:
        """Are we at the end of the stream?"""
        return not self

    def push(self, token: Token) -> None:
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self) -> Token:
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        # Re-queue at the *front*: the peeked token must stay the very next
        # token even when other tokens are already waiting in ``_pushed``.
        self._pushed.appendleft(result)
        self.current = old_token
        return result

    def skip(self, n: int = 1) -> None:
        """Go n tokens ahead."""
        for _ in range(n):
            next(self)

    def next_if(self, expr: str) -> t.Optional[Token]:
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

        return None

    def skip_if(self, expr: str) -> bool:
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def __next__(self) -> Token:
        """Go one token ahead and return the old one.

        Use the built-in :func:`next` instead of calling this directly.
        """
        rv = self.current

        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()

        return rv

    def close(self) -> None:
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, "")
        self._iter = iter(())
        self.closed = True

    def expect(self, expr: str) -> Token:
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)

            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError(
                    f"unexpected end of template, expected {expr!r}.",
                    self.current.lineno,
                    self.name,
                    self.filename,
                )

            raise TemplateSyntaxError(
                f"expected token {expr!r}, got {describe_token(self.current)!r}",
                self.current.lineno,
                self.name,
                self.filename,
            )

        return next(self)


def get_lexer(environment: "Environment") -> "Lexer":
    """Return a lexer which is probably cached."""
    key = (
        environment.block_start_string,
        environment.block_end_string,
        environment.variable_start_string,
        environment.variable_end_string,
        environment.comment_start_string,
        environment.comment_end_string,
        environment.line_statement_prefix,
        environment.line_comment_prefix,
        environment.trim_blocks,
        environment.lstrip_blocks,
        environment.newline_sequence,
        environment.keep_trailing_newline,
    )
    lexer = _lexer_cache.get(key)

    if lexer is None:
        _lexer_cache[key] = lexer = Lexer(environment)

    return lexer


class OptionalLStrip(tuple):
    """A special tuple for marking a point in the state that can have
    lstrip applied.
    """

    __slots__ = ()

    # Even though it looks like a no-op, creating instances fails
    # without this.
    def __new__(cls, *members, **kwargs):  # type: ignore
        return super().__new__(cls, members)


class _Rule(t.NamedTuple):
    pattern: t.Pattern[str]
    tokens: t.Union[str, t.Tuple[str, ...], t.Tuple[Failure]]
    command: t.Optional[str]


class Lexer:
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment: "Environment") -> None:
        # shortcuts
        e = re.escape

        def c(x: str) -> t.Pattern[str]:
            return re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules: t.List[_Rule] = [
            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
            _Rule(float_re, TOKEN_FLOAT, None),
            _Rule(integer_re, TOKEN_INTEGER, None),
            _Rule(name_re, TOKEN_NAME, None),
            _Rule(string_re, TOKEN_STRING, None),
            _Rule(operator_re, TOKEN_OPERATOR, None),
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        block_start_re = e(environment.block_start_string)
        block_end_re = e(environment.block_end_string)
        comment_end_re = e(environment.comment_end_string)
        variable_end_re = e(environment.variable_end_string)

        # block suffix if trimming is enabled
        block_suffix_re = "\\n?" if environment.trim_blocks else ""

        self.lstrip_blocks = environment.lstrip_blocks

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        root_raw_re = (
            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
        )
        root_parts_re = "|".join(
            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
        )

        # global lexing rules
        self.rules: t.Dict[str, t.List[_Rule]] = {
            "root": [
                # directives
                _Rule(
                    c(rf"(.*?)(?:{root_parts_re})"),
                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
                    "#bygroup",
                ),
                # data
                _Rule(c(".+"), TOKEN_DATA, None),
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
                        rf"|{comment_end_re}{block_suffix_re}))"
                    ),
                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                _Rule(
                    c(
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re})"
                    ),
                    TOKEN_BLOCK_END,
                    "#pop",
                ),
            ]
            + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                _Rule(
                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
                    TOKEN_VARIABLE_END,
                    "#pop",
                )
            ]
            + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re}))"
                    ),
                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
            ]
            + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                _Rule(
                    c(r"(.*?)()(?=\n|$)"),
                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
                    "#pop",
                )
            ],
        }

    def _normalize_newlines(self, value: str) -> str:
        """Replace all newlines with the configured sequence in strings
        and template data.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(
        self,
        source: str,
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> TokenStream:
        """Calls tokeniter + wrap and wraps the result in a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(
        self,
        stream: t.Iterable[t.Tuple[int, str, str]],
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
    ) -> t.Iterator[Token]:
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value_str in stream:
            if token in ignored_tokens:
                continue

            value: t.Any = value_str

            if token == TOKEN_LINESTATEMENT_BEGIN:
                token = TOKEN_BLOCK_BEGIN
            elif token == TOKEN_LINESTATEMENT_END:
                token = TOKEN_BLOCK_END
            # we are not interested in those tokens in the parser
            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
                continue
            elif token == TOKEN_DATA:
                value = self._normalize_newlines(value_str)
            elif token == "keyword":
                token = value_str
            elif token == TOKEN_NAME:
                value = value_str

                if not value.isidentifier():
                    raise TemplateSyntaxError(
                        "Invalid character in identifier", lineno, name, filename
                    )
            elif token == TOKEN_STRING:
                # try to unescape string
                try:
                    value = (
                        self._normalize_newlines(value_str[1:-1])
                        .encode("ascii", "backslashreplace")
                        .decode("unicode-escape")
                    )
                except Exception as e:
                    msg = str(e).split(":")[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
            elif token == TOKEN_INTEGER:
                value = int(value_str.replace("_", ""), 0)
            elif token == TOKEN_FLOAT:
                # remove all "_" first to support more Python versions
                value = literal_eval(value_str.replace("_", ""))
            elif token == TOKEN_OPERATOR:
                token = operators[value_str]

            yield Token(lineno, token, value)

    def tokeniter(
        self,
        source: str,
        name: t.Optional[str],
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> t.Iterator[t.Tuple[int, str, str]]:
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.

        .. versionchanged:: 3.0
            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
            breaks.
        """
        # newline_re has one capture group, so split() interleaves the
        # separators; [::2] keeps only the line contents
        lines = newline_re.split(source)[::2]

        if not self.keep_trailing_newline and lines[-1] == "":
            del lines[-1]

        source = "\n".join(lines)
        pos = 0
        lineno = 1
        stack = ["root"]

        if state is not None and state != "root":
            assert state in ("variable", "block"), "invalid state"
            stack.append(state + "_begin")

        statetokens = self.rules[stack[-1]]
        source_length = len(source)
        balancing_stack: t.List[str] = []
        newlines_stripped = 0
        line_starting = True

        while True:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)

                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and tokens in (
                    TOKEN_VARIABLE_END,
                    TOKEN_BLOCK_END,
                    TOKEN_LINESTATEMENT_END,
                ):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    groups: t.Sequence[str] = m.groups()

                    if isinstance(tokens, OptionalLStrip):
                        # Rule supports lstrip. Match will look like
                        # text, block type, whitespace control, type, control, ...
                        text = groups[0]
                        # Skipping the text and first type, every other group is the
                        # whitespace control for each type. One of the groups will be
                        # -, +, or empty string instead of None.
                        strip_sign = next(g for g in groups[2::2] if g is not None)

                        if strip_sign == "-":
                            # Strip all whitespace between the text and the tag.
                            stripped = text.rstrip()
                            newlines_stripped = text[len(stripped) :].count("\n")
                            groups = [stripped, *groups[1:]]
                        elif (
                            # Not marked for preserving whitespace.
                            strip_sign != "+"
                            # lstrip is enabled.
                            and self.lstrip_blocks
                            # Not a variable expression.
                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                        ):
                            # The start of text between the last newline and the tag.
                            l_pos = text.rfind("\n") + 1

                            if l_pos > 0 or line_starting:
                                # If there's only whitespace between the newline and the
                                # tag, strip it.
                                if whitespace_re.fullmatch(text, l_pos):
                                    groups = [text[:l_pos], *groups[1:]]

                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == "#bygroup":
                            for key, value in m.groupdict().items():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count("\n")
                                    break
                            else:
                                raise RuntimeError(
                                    f"{regex!r} wanted to resolve the token dynamically"
                                    " but no group matched"
                                )
                        # normal group
                        else:
                            data = groups[idx]

                            if data or token not in ignore_if_empty:
                                yield lineno, token, data

                            lineno += data.count("\n") + newlines_stripped
                            newlines_stripped = 0

                # strings as tokens are just yielded as is.
                else:
                    data = m.group()

                    # update brace/parentheses balance
                    if tokens == TOKEN_OPERATOR:
                        if data == "{":
                            balancing_stack.append("}")
                        elif data == "(":
                            balancing_stack.append(")")
                        elif data == "[":
                            balancing_stack.append("]")
                        elif data in ("}", ")", "]"):
                            if not balancing_stack:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}'", lineno, name, filename
                                )

                            expected_op = balancing_stack.pop()

                            if expected_op != data:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}', expected '{expected_op}'",
                                    lineno,
                                    name,
                                    filename,
                                )

                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data

                    lineno += data.count("\n")

                line_starting = m.group()[-1:] == "\n"
                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == "#pop":
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == "#bygroup":
                        for key, value in m.groupdict().items():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError(
                                f"{regex!r} wanted to resolve the new state dynamically"
                                f" but no group matched"
                            )
                    # direct state name given
                    else:
                        stack.append(new_state)

                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError(
                        f"{regex!r} yielded empty string without stack change"
                    )

                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return

                # something went wrong
                raise TemplateSyntaxError(
                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
                )
def describe_token(token: "Token") -> str:
    """Return a human-readable description of ``token``.

    A name token is described by its value; any other token is described
    by its type via ``_describe_token_type``.
    """
    is_name = token.type == TOKEN_NAME
    return token.value if is_name else _describe_token_type(token.type)
Returns a description of the token.
def describe_token_expr(expr: str) -> str:
    """Describe a token expression (``'type'`` or ``'type:value'``).

    For a ``name:value`` expression the value itself is returned; in every
    other case the type part is described via ``_describe_token_type``.
    """
    # partition() leaves ``separator`` empty when the expression has no ":",
    # in which case ``kind`` is the whole expression.
    kind, separator, payload = expr.partition(":")

    if separator and kind == TOKEN_NAME:
        return payload

    return _describe_token_type(kind)
Like describe_token but for token expressions.
def count_newlines(value: str) -> int:
    """Return how many matches of ``newline_re`` occur in ``value``.

    Handy for extensions that filter a stream and need to keep line
    numbers in sync.
    """
    return sum(1 for _match in newline_re.finditer(value))
Count the number of newline characters in the string. This is useful for extensions that filter a stream.
def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
    """Build the ``(token_type, regex_source)`` rules for the tag openers.

    One rule is produced for each of the environment's comment, block and
    variable start strings, plus the line statement / line comment prefixes
    when they are configured. Rules are ordered by descending delimiter
    length (ties broken by token type, then pattern).
    """
    escape = re.escape
    weighted: t.List[t.Tuple[int, str, str]] = []

    def add(delimiter: str, token_type: str, lead: str = "") -> None:
        # The length goes first so that sorting the tuples orders by it.
        weighted.append((len(delimiter), token_type, lead + escape(delimiter)))

    add(environment.comment_start_string, TOKEN_COMMENT_BEGIN)
    add(environment.block_start_string, TOKEN_BLOCK_BEGIN)
    add(environment.variable_start_string, TOKEN_VARIABLE_BEGIN)

    statement_prefix = environment.line_statement_prefix
    if statement_prefix is not None:
        add(statement_prefix, TOKEN_LINESTATEMENT_BEGIN, r"^[ \t\v]*")

    comment_prefix = environment.line_comment_prefix
    if comment_prefix is not None:
        add(comment_prefix, TOKEN_LINECOMMENT_BEGIN, r"(?:^|(?<=\S))[^\S\r\n]*")

    weighted.sort(reverse=True)
    # Drop the length; it was only the sort key.
    return [(token_type, pattern) for _length, token_type, pattern in weighted]
Compiles all the rules from the environment into a list of rules.
class Failure:
    """Callable that raises a configured error when invoked.

    The `Lexer` places instances in its rule tables to mark input that is
    a known syntax error.
    """

    def __init__(
        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
    ) -> None:
        # message: text handed to the error class when the failure fires
        # cls: error class to instantiate; defaults to TemplateSyntaxError
        self.error_class = cls
        self.message = message

    def __call__(self, lineno: int, filename: str) -> "te.NoReturn":
        # NOTE(review): ``filename`` is passed as the third positional
        # argument of the error class — confirm that this is the intended
        # slot for it.
        error = self.error_class(self.message, lineno, filename)
        raise error
Class that raises a TemplateSyntaxError if called.
Used by the Lexer to specify known errors.
class Token(t.NamedTuple):
    """A lexed token: the line it starts on, its type and its value."""

    lineno: int
    type: str
    value: str

    def __str__(self) -> str:
        return describe_token(self)

    def test(self, expr: str) -> bool:
        """Check this token against a token expression.

        ``expr`` is either a bare token type or ``'token_type:token_value'``.
        Only string types and values can be tested this way.
        """
        # Plain equality (not identity): callers, e.g. via test_any, usually
        # pass strings that are not interned.
        if expr == self.type:
            return True

        kind, separator, wanted = expr.partition(":")

        if not separator:
            return False

        return (kind, wanted) == (self.type, self.value)

    def test_any(self, *iterable: str) -> bool:
        """Return ``True`` if any of the given token expressions matches."""
        for candidate in iterable:
            if self.test(candidate):
                return True

        return False
Token(lineno, type, value)
276 def test(self, expr: str) -> bool: 277 """Test a token against a token expression. This can either be a 278 token type or ``'token_type:token_value'``. This can only test 279 against string values and types. 280 """ 281 # here we do a regular string equality check as test_any is usually 282 # passed an iterable of not interned strings. 283 if self.type == expr: 284 return True 285 286 if ":" in expr: 287 return expr.split(":", 1) == [self.type, self.value] 288 289 return False
Test a token against a token expression. This can either be a
token type or 'token_type:token_value'. This can only test
against string values and types.
291 def test_any(self, *iterable: str) -> bool: 292 """Test against multiple token expressions.""" 293 return any(self.test(expr) for expr in iterable)
Test against multiple token expressions.
Inherited Members
- builtins.tuple
- index
- count
class TokenStreamIterator:
    """Iterator over a token stream.

    Yields the stream's tokens one at a time and stops (closing the
    stream) once the eof token is the current one.
    """

    def __init__(self, stream: "TokenStream") -> None:
        self.stream = stream

    def __iter__(self) -> "TokenStreamIterator":
        return self

    def __next__(self) -> Token:
        stream = self.stream
        current = stream.current

        if current.type is not TOKEN_EOF:
            # Advance the underlying stream, hand out the token we were on.
            next(stream)
            return current

        stream.close()
        raise StopIteration
The iterator for tokenstreams. Iterate over the stream until the eof token is reached.
class TokenStream:
    """A token stream is an iterable that yields :class:`Token`\\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(
        self,
        generator: t.Iterable[Token],
        name: t.Optional[str],
        filename: t.Optional[str],
    ):
        self._iter = iter(generator)
        # tokens pushed back onto the stream; consumed before ``_iter``
        self._pushed: "te.Deque[Token]" = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, "")
        # replace the placeholder with the first real token
        next(self)

    def __iter__(self) -> TokenStreamIterator:
        return TokenStreamIterator(self)

    def __bool__(self) -> bool:
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    @property
    def eos(self) -> bool:
        """Are we at the end of the stream?"""
        return not self

    def push(self, token: Token) -> None:
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self) -> Token:
        """Look at the next token without consuming it.

        The stream is advanced by one token, the token found there is put
        back at the *front* of the pushback queue, and :attr:`current` is
        restored. Using the front keeps the peeked token first in line even
        when other tokens are already waiting in the queue; appending it at
        the back would move it behind them and reorder the stream.
        """
        old_token = next(self)
        result = self.current
        self._pushed.appendleft(result)
        self.current = old_token
        return result

    def skip(self, n: int = 1) -> None:
        """Go n tokens ahead."""
        for _ in range(n):
            next(self)

    def next_if(self, expr: str) -> t.Optional[Token]:
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

        return None

    def skip_if(self, expr: str) -> bool:
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def __next__(self) -> Token:
        """Go one token ahead and return the old one.

        Use the built-in :func:`next` instead of calling this directly.
        """
        rv = self.current

        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()

        return rv

    def close(self) -> None:
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, "")
        self._iter = iter(())
        self.closed = True

    def expect(self, expr: str) -> Token:
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.

        Raises `TemplateSyntaxError` if the current token does not match.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)

            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError(
                    f"unexpected end of template, expected {expr!r}.",
                    self.current.lineno,
                    self.name,
                    self.filename,
                )

            raise TemplateSyntaxError(
                f"expected token {expr!r}, got {describe_token(self.current)!r}",
                self.current.lineno,
                self.name,
                self.filename,
            )

        return next(self)
A token stream is an iterable that yields Tokens. The
parser however does not iterate over it but calls next() to go
one token ahead. The current active token is stored as current.
def __init__(
    self,
    generator: t.Iterable[Token],
    name: t.Optional[str],
    filename: t.Optional[str],
):
    """Set up the stream over ``generator`` and load its first token.

    ``name`` and ``filename`` are stored for use in error reporting.
    """
    self._iter = iter(generator)
    # Queue of tokens that were handed back to the stream.
    pushed_back: "te.Deque[Token]" = deque()
    self._pushed = pushed_back
    self.name, self.filename = name, filename
    self.closed = False
    # Start on a placeholder token, then step onto the first real one.
    self.current = Token(1, TOKEN_INITIAL, "")
    next(self)
@property
def eos(self) -> bool:
    """Whether the stream is at its end (the inverse of its truth value)."""
    return False if self else True
Are we at the end of the stream?
def push(self, token: Token) -> None:
    """Hand ``token`` back to the stream by adding it to the end of the
    push-back queue.
    """
    queue = self._pushed
    queue.append(token)
Push a token back to the stream.
def look(self) -> Token:
    """Look at the next token without changing :attr:`current`.

    The peeked token is put back at the *front* of the push-back queue,
    so tokens queued earlier with :meth:`push` keep their position behind
    it and the stream order is unchanged.
    """
    old_token = next(self)
    result = self.current
    # appendleft instead of push(): push() appends at the back, which
    # would move the peeked token behind tokens that are still queued.
    self._pushed.appendleft(result)
    self.current = old_token
    return result
Look at the next token.
def skip(self, n: int = 1) -> None:
    """Go ``n`` tokens ahead, discarding the tokens stepped over."""
    for _step in range(n):
        next(self)
Go n tokens ahead.
def next_if(self, expr: str) -> t.Optional[Token]:
    """Advance the stream and return the old current token if it passes
    the token test ``expr``; otherwise leave the stream alone and return
    `None`.
    """
    return next(self) if self.current.test(expr) else None
Perform the token test and return the token if it matched.
Otherwise the return value is None.
def skip_if(self, expr: str) -> bool:
    """Like :meth:`next_if`, but report only whether a token was consumed
    (`True`) or not (`False`).
    """
    consumed = self.next_if(expr)
    return consumed is not None
Like next_if() but only returns True or False.
def close(self) -> None:
    """Close the stream: make the current token an EOF token on the same
    line, drop the underlying iterator and mark the stream as closed.
    """
    last_lineno = self.current.lineno
    eof_token = Token(last_lineno, TOKEN_EOF, "")
    self.current = eof_token
    self._iter = iter(())
    self.closed = True
Close the stream.
def expect(self, expr: str) -> Token:
    """Require the current token to pass the token test ``expr`` (same
    argument as :meth:`jinja2.lexer.Token.test`), advance the stream and
    return that token.

    :raises TemplateSyntaxError: if the current token does not match.
    """
    # Success path first; everything below builds the error.
    if self.current.test(expr):
        return next(self)

    expr = describe_token_expr(expr)

    if self.current.type is TOKEN_EOF:
        message = f"unexpected end of template, expected {expr!r}."
    else:
        message = f"expected token {expr!r}, got {describe_token(self.current)!r}"

    raise TemplateSyntaxError(
        message,
        self.current.lineno,
        self.name,
        self.filename,
    )
Expect a given token type and return it. This accepts the same
argument as Token.test().
def get_lexer(environment: "Environment") -> "Lexer":
    """Return the lexer for ``environment``, reusing a cached instance when
    one exists for the same lexer-relevant settings.
    """
    # The environment options that make up the cache key, in key order.
    option_names = (
        "block_start_string",
        "block_end_string",
        "variable_start_string",
        "variable_end_string",
        "comment_start_string",
        "comment_end_string",
        "line_statement_prefix",
        "line_comment_prefix",
        "trim_blocks",
        "lstrip_blocks",
        "newline_sequence",
        "keep_trailing_newline",
    )
    key = tuple(getattr(environment, option) for option in option_names)
    cached = _lexer_cache.get(key)

    if cached is not None:
        return cached

    lexer = Lexer(environment)
    _lexer_cache[key] = lexer
    return lexer
Return a lexer which is probably cached.
class OptionalLStrip(tuple):
    """Tuple subclass used as a marker: a rule whose token spec is an
    ``OptionalLStrip`` is a point in the state where lstrip may be applied.
    """

    __slots__ = ()

    # Looks like a no-op, but instances cannot be created without it:
    # the positional members are packed into the single tuple argument.
    def __new__(cls, *members, **kwargs):  # type: ignore
        return tuple.__new__(cls, members)
A special tuple for marking a point in the state that can have lstrip applied.
Inherited Members
- builtins.tuple
- index
- count
class Lexer:
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment: "Environment") -> None:
        """Build the per-state rule tables from the delimiters and
        whitespace options of ``environment``.
        """
        # shortcuts
        e = re.escape

        def c(x: str) -> t.Pattern[str]:
            # Every state pattern is compiled multi-line and dot-matches-all.
            return re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules: t.List[_Rule] = [
            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
            _Rule(float_re, TOKEN_FLOAT, None),
            _Rule(integer_re, TOKEN_INTEGER, None),
            _Rule(name_re, TOKEN_NAME, None),
            _Rule(string_re, TOKEN_STRING, None),
            _Rule(operator_re, TOKEN_OPERATOR, None),
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        block_start_re = e(environment.block_start_string)
        block_end_re = e(environment.block_end_string)
        comment_end_re = e(environment.comment_end_string)
        variable_end_re = e(environment.variable_end_string)

        # block suffix if trimming is enabled
        block_suffix_re = "\\n?" if environment.trim_blocks else ""

        self.lstrip_blocks = environment.lstrip_blocks

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        root_raw_re = (
            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
        )
        # One named group per tag type, each followed by a group for the
        # optional whitespace-control sign ("-", "+" or empty).
        root_parts_re = "|".join(
            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
        )

        # global lexing rules
        self.rules: t.Dict[str, t.List[_Rule]] = {
            "root": [
                # directives
                _Rule(
                    c(rf"(.*?)(?:{root_parts_re})"),
                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
                    "#bygroup",
                ),
                # data
                _Rule(c(".+"), TOKEN_DATA, None),
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
                        rf"|{comment_end_re}{block_suffix_re}))"
                    ),
                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                _Rule(
                    c(
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re})"
                    ),
                    TOKEN_BLOCK_END,
                    "#pop",
                ),
            ]
            + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                _Rule(
                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
                    TOKEN_VARIABLE_END,
                    "#pop",
                )
            ]
            + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re}))"
                    ),
                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
            ]
            + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                _Rule(
                    c(r"(.*?)()(?=\n|$)"),
                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
                    "#pop",
                )
            ],
        }

    def _normalize_newlines(self, value: str) -> str:
        """Replace all newlines with the configured sequence in strings
        and template data.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(
        self,
        source: str,
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> TokenStream:
        """Calls tokeniter + wrap and wraps the result in a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(
        self,
        stream: t.Iterable[t.Tuple[int, str, str]],
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
    ) -> t.Iterator[Token]:
        """This is called with the stream as returned by `tokeniter` and
        wraps every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value_str in stream:
            if token in ignored_tokens:
                continue

            value: t.Any = value_str

            if token == TOKEN_LINESTATEMENT_BEGIN:
                token = TOKEN_BLOCK_BEGIN
            elif token == TOKEN_LINESTATEMENT_END:
                token = TOKEN_BLOCK_END
            # we are not interested in those tokens in the parser
            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
                continue
            elif token == TOKEN_DATA:
                value = self._normalize_newlines(value_str)
            elif token == "keyword":
                token = value_str
            elif token == TOKEN_NAME:
                value = value_str

                if not value.isidentifier():
                    raise TemplateSyntaxError(
                        "Invalid character in identifier", lineno, name, filename
                    )
            elif token == TOKEN_STRING:
                # try to unescape string
                try:
                    value = (
                        self._normalize_newlines(value_str[1:-1])
                        .encode("ascii", "backslashreplace")
                        .decode("unicode-escape")
                    )
                except Exception as e:
                    # Keep only the text after the last ":" of the message.
                    msg = str(e).split(":")[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
            elif token == TOKEN_INTEGER:
                # base 0 lets int() honour the 0b / 0o / 0x prefixes
                value = int(value_str.replace("_", ""), 0)
            elif token == TOKEN_FLOAT:
                # remove all "_" first to support more Python versions
                value = literal_eval(value_str.replace("_", ""))
            elif token == TOKEN_OPERATOR:
                token = operators[value_str]

            yield Token(lineno, token, value)

    def tokeniter(
        self,
        source: str,
        name: t.Optional[str],
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> t.Iterator[t.Tuple[int, str, str]]:
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.

        Each yielded item is a ``(lineno, token, value)`` tuple.

        :param state: optional initial state, ``"variable"`` or
            ``"block"``, pushed on top of ``"root"`` before lexing starts.

        .. versionchanged:: 3.0
            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
            breaks.
        """
        # newline_re has one capturing group, so split() alternates text
        # and separators; [::2] keeps only the text pieces.
        lines = newline_re.split(source)[::2]

        if not self.keep_trailing_newline and lines[-1] == "":
            del lines[-1]

        source = "\n".join(lines)
        pos = 0
        lineno = 1
        stack = ["root"]

        if state is not None and state != "root":
            assert state in ("variable", "block"), "invalid state"
            stack.append(state + "_begin")

        statetokens = self.rules[stack[-1]]
        source_length = len(source)
        # Closing brackets still expected inside the current expression.
        balancing_stack: t.List[str] = []
        newlines_stripped = 0
        line_starting = True

        while True:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)

                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and tokens in (
                    TOKEN_VARIABLE_END,
                    TOKEN_BLOCK_END,
                    TOKEN_LINESTATEMENT_END,
                ):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    groups: t.Sequence[str] = m.groups()

                    if isinstance(tokens, OptionalLStrip):
                        # Rule supports lstrip. Match will look like
                        # text, block type, whitespace control, type, control, ...
                        text = groups[0]
                        # Skipping the text and first type, every other group is the
                        # whitespace control for each type. One of the groups will be
                        # -, +, or empty string instead of None.
                        strip_sign = next(g for g in groups[2::2] if g is not None)

                        if strip_sign == "-":
                            # Strip all whitespace between the text and the tag.
                            stripped = text.rstrip()
                            newlines_stripped = text[len(stripped) :].count("\n")
                            groups = [stripped, *groups[1:]]
                        elif (
                            # Not marked for preserving whitespace.
                            strip_sign != "+"
                            # lstrip is enabled.
                            and self.lstrip_blocks
                            # Not a variable expression.
                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                        ):
                            # The start of text between the last newline and the tag.
                            l_pos = text.rfind("\n") + 1

                            if l_pos > 0 or line_starting:
                                # If there's only whitespace between the newline and the
                                # tag, strip it.
                                if whitespace_re.fullmatch(text, l_pos):
                                    groups = [text[:l_pos], *groups[1:]]

                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            # NOTE(review): only lineno and filename are
                            # passed here, not name; confirm against
                            # Failure's call signature.
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == "#bygroup":
                            for key, value in m.groupdict().items():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count("\n")
                                    break
                            else:
                                raise RuntimeError(
                                    f"{regex!r} wanted to resolve the token dynamically"
                                    " but no group matched"
                                )
                        # normal group
                        else:
                            data = groups[idx]

                            if data or token not in ignore_if_empty:
                                yield lineno, token, data

                            # Also count newlines removed by "-" stripping,
                            # which are no longer part of data.
                            lineno += data.count("\n") + newlines_stripped
                            newlines_stripped = 0

                # plain string tokens are yielded as they are.
                else:
                    data = m.group()

                    # update brace/parentheses balance
                    if tokens == TOKEN_OPERATOR:
                        if data == "{":
                            balancing_stack.append("}")
                        elif data == "(":
                            balancing_stack.append(")")
                        elif data == "[":
                            balancing_stack.append("]")
                        elif data in ("}", ")", "]"):
                            if not balancing_stack:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}'", lineno, name, filename
                                )

                            expected_op = balancing_stack.pop()

                            if expected_op != data:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}', expected '{expected_op}'",
                                    lineno,
                                    name,
                                    filename,
                                )

                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data

                    lineno += data.count("\n")

                line_starting = m.group()[-1:] == "\n"
                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == "#pop":
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == "#bygroup":
                        for key, value in m.groupdict().items():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError(
                                f"{regex!r} wanted to resolve the new state dynamically"
                                f" but no group matched"
                            )
                    # direct state name given
                    else:
                        stack.append(new_state)

                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError(
                        f"{regex!r} yielded empty string without stack change"
                    )

                # advance to the new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return

                # something went wrong
                raise TemplateSyntaxError(
                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
                )
Class that implements a lexer for a given environment. Automatically created by the environment class, usually you don't have to do that.
Note that the lexer is not automatically bound to an environment. Multiple environments can share the same lexer.
def __init__(self, environment: "Environment") -> None:
    """Build the per-state rule tables from the delimiters and
    whitespace options of ``environment``.
    """
    # shortcuts
    e = re.escape

    def c(x: str) -> t.Pattern[str]:
        # Every state pattern is compiled multi-line and dot-matches-all.
        return re.compile(x, re.M | re.S)

    # lexing rules for tags
    tag_rules: t.List[_Rule] = [
        _Rule(whitespace_re, TOKEN_WHITESPACE, None),
        _Rule(float_re, TOKEN_FLOAT, None),
        _Rule(integer_re, TOKEN_INTEGER, None),
        _Rule(name_re, TOKEN_NAME, None),
        _Rule(string_re, TOKEN_STRING, None),
        _Rule(operator_re, TOKEN_OPERATOR, None),
    ]

    # assemble the root lexing rule. because "|" is ungreedy
    # we have to sort by length so that the lexer continues working
    # as expected when we have parsing rules like <% for block and
    # <%= for variables. (if someone wants asp like syntax)
    # variables are just part of the rules if variable processing
    # is required.
    root_tag_rules = compile_rules(environment)

    block_start_re = e(environment.block_start_string)
    block_end_re = e(environment.block_end_string)
    comment_end_re = e(environment.comment_end_string)
    variable_end_re = e(environment.variable_end_string)

    # block suffix if trimming is enabled
    block_suffix_re = "\\n?" if environment.trim_blocks else ""

    self.lstrip_blocks = environment.lstrip_blocks

    self.newline_sequence = environment.newline_sequence
    self.keep_trailing_newline = environment.keep_trailing_newline

    root_raw_re = (
        rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
        rf"(?:\-{block_end_re}\s*|{block_end_re}))"
    )
    # One named group per tag type, each followed by a group for the
    # optional whitespace-control sign ("-", "+" or empty).
    root_parts_re = "|".join(
        [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
    )

    # global lexing rules
    self.rules: t.Dict[str, t.List[_Rule]] = {
        "root": [
            # directives
            _Rule(
                c(rf"(.*?)(?:{root_parts_re})"),
                OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
                "#bygroup",
            ),
            # data
            _Rule(c(".+"), TOKEN_DATA, None),
        ],
        # comments
        TOKEN_COMMENT_BEGIN: [
            _Rule(
                c(
                    rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
                    rf"|{comment_end_re}{block_suffix_re}))"
                ),
                (TOKEN_COMMENT, TOKEN_COMMENT_END),
                "#pop",
            ),
            _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
        ],
        # blocks
        TOKEN_BLOCK_BEGIN: [
            _Rule(
                c(
                    rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                    rf"|{block_end_re}{block_suffix_re})"
                ),
                TOKEN_BLOCK_END,
                "#pop",
            ),
        ]
        + tag_rules,
        # variables
        TOKEN_VARIABLE_BEGIN: [
            _Rule(
                c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
                TOKEN_VARIABLE_END,
                "#pop",
            )
        ]
        + tag_rules,
        # raw block
        TOKEN_RAW_BEGIN: [
            _Rule(
                c(
                    rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
                    rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                    rf"|{block_end_re}{block_suffix_re}))"
                ),
                OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
                "#pop",
            ),
            _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
        ],
        # line statements
        TOKEN_LINESTATEMENT_BEGIN: [
            _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
        ]
        + tag_rules,
        # line comments
        TOKEN_LINECOMMENT_BEGIN: [
            _Rule(
                c(r"(.*?)()(?=\n|$)"),
                (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
                "#pop",
            )
        ],
    }
def tokenize(
    self,
    source: str,
    name: t.Optional[str] = None,
    filename: t.Optional[str] = None,
    state: t.Optional[str] = None,
) -> TokenStream:
    """Tokenize ``source`` with :meth:`tokeniter`, convert the raw tokens
    with :meth:`wrap` and return them as a :class:`TokenStream`.
    """
    raw_tokens = self.tokeniter(source, name, filename, state)
    converted = self.wrap(raw_tokens, name, filename)
    return TokenStream(converted, name, filename)
Calls tokeniter + wrap and wraps the result in a token stream.
def wrap(
    self,
    stream: t.Iterable[t.Tuple[int, str, str]],
    name: t.Optional[str] = None,
    filename: t.Optional[str] = None,
) -> t.Iterator[Token]:
    """Turn the raw ``(lineno, token, value)`` tuples produced by
    :meth:`tokeniter` into :class:`Token` objects, skipping ignored and
    raw-marker tokens and converting literal values.

    :raises TemplateSyntaxError: for a name that is not a valid
        identifier or a string literal that cannot be unescaped.
    """
    for lineno, kind, text in stream:
        # Ignored tokens and the raw begin/end markers are not passed on
        # to the parser.
        if kind in ignored_tokens or kind in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
            continue

        value: t.Any = text

        if kind == TOKEN_LINESTATEMENT_BEGIN:
            kind = TOKEN_BLOCK_BEGIN
        elif kind == TOKEN_LINESTATEMENT_END:
            kind = TOKEN_BLOCK_END
        elif kind == TOKEN_DATA:
            value = self._normalize_newlines(text)
        elif kind == "keyword":
            kind = text
        elif kind == TOKEN_NAME:
            if not text.isidentifier():
                raise TemplateSyntaxError(
                    "Invalid character in identifier", lineno, name, filename
                )
        elif kind == TOKEN_STRING:
            # Drop the quotes, then resolve the escape sequences.
            try:
                unquoted = self._normalize_newlines(text[1:-1])
                ascii_bytes = unquoted.encode("ascii", "backslashreplace")
                value = ascii_bytes.decode("unicode-escape")
            except Exception as e:
                msg = str(e).split(":")[-1].strip()
                raise TemplateSyntaxError(msg, lineno, name, filename) from e
        elif kind == TOKEN_INTEGER:
            # base 0 lets int() honour the 0b / 0o / 0x prefixes
            digits = text.replace("_", "")
            value = int(digits, 0)
        elif kind == TOKEN_FLOAT:
            # remove all "_" first to support more Python versions
            digits = text.replace("_", "")
            value = literal_eval(digits)
        elif kind == TOKEN_OPERATOR:
            kind = operators[text]

        yield Token(lineno, kind, value)
def tokeniter(
    self,
    source: str,
    name: t.Optional[str],
    filename: t.Optional[str] = None,
    state: t.Optional[str] = None,
) -> t.Iterator[t.Tuple[int, str, str]]:
    """This method tokenizes the text and returns the tokens in a
    generator. Use this method if you just want to tokenize a template.

    Each yielded item is a ``(lineno, token, value)`` tuple.

    :param state: optional initial state, ``"variable"`` or ``"block"``,
        pushed on top of ``"root"`` before lexing starts.

    .. versionchanged:: 3.0
        Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
        breaks.
    """
    # newline_re has one capturing group, so split() alternates text and
    # separators; [::2] keeps only the text pieces.
    lines = newline_re.split(source)[::2]

    if not self.keep_trailing_newline and lines[-1] == "":
        del lines[-1]

    source = "\n".join(lines)
    pos = 0
    lineno = 1
    stack = ["root"]

    if state is not None and state != "root":
        assert state in ("variable", "block"), "invalid state"
        stack.append(state + "_begin")

    statetokens = self.rules[stack[-1]]
    source_length = len(source)
    # Closing brackets still expected inside the current expression.
    balancing_stack: t.List[str] = []
    newlines_stripped = 0
    line_starting = True

    while True:
        # tokenizer loop
        for regex, tokens, new_state in statetokens:
            m = regex.match(source, pos)

            # if no match we try again with the next rule
            if m is None:
                continue

            # we only match blocks and variables if braces / parentheses
            # are balanced. continue parsing with the lower rule which
            # is the operator rule. do this only if the end tags look
            # like operators
            if balancing_stack and tokens in (
                TOKEN_VARIABLE_END,
                TOKEN_BLOCK_END,
                TOKEN_LINESTATEMENT_END,
            ):
                continue

            # tuples support more options
            if isinstance(tokens, tuple):
                groups: t.Sequence[str] = m.groups()

                if isinstance(tokens, OptionalLStrip):
                    # Rule supports lstrip. Match will look like
                    # text, block type, whitespace control, type, control, ...
                    text = groups[0]
                    # Skipping the text and first type, every other group is the
                    # whitespace control for each type. One of the groups will be
                    # -, +, or empty string instead of None.
                    strip_sign = next(g for g in groups[2::2] if g is not None)

                    if strip_sign == "-":
                        # Strip all whitespace between the text and the tag.
                        stripped = text.rstrip()
                        newlines_stripped = text[len(stripped) :].count("\n")
                        groups = [stripped, *groups[1:]]
                    elif (
                        # Not marked for preserving whitespace.
                        strip_sign != "+"
                        # lstrip is enabled.
                        and self.lstrip_blocks
                        # Not a variable expression.
                        and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                    ):
                        # The start of text between the last newline and the tag.
                        l_pos = text.rfind("\n") + 1

                        if l_pos > 0 or line_starting:
                            # If there's only whitespace between the newline and the
                            # tag, strip it.
                            if whitespace_re.fullmatch(text, l_pos):
                                groups = [text[:l_pos], *groups[1:]]

                for idx, token in enumerate(tokens):
                    # failure group
                    if token.__class__ is Failure:
                        # NOTE(review): only lineno and filename are passed
                        # here, not name; confirm against Failure's call
                        # signature.
                        raise token(lineno, filename)
                    # bygroup is a bit more complex, in that case we
                    # yield for the current token the first named
                    # group that matched
                    elif token == "#bygroup":
                        for key, value in m.groupdict().items():
                            if value is not None:
                                yield lineno, key, value
                                lineno += value.count("\n")
                                break
                        else:
                            raise RuntimeError(
                                f"{regex!r} wanted to resolve the token dynamically"
                                " but no group matched"
                            )
                    # normal group
                    else:
                        data = groups[idx]

                        if data or token not in ignore_if_empty:
                            yield lineno, token, data

                        # Also count newlines removed by "-" stripping,
                        # which are no longer part of data.
                        lineno += data.count("\n") + newlines_stripped
                        newlines_stripped = 0

            # plain string tokens are yielded as they are.
            else:
                data = m.group()

                # update brace/parentheses balance
                if tokens == TOKEN_OPERATOR:
                    if data == "{":
                        balancing_stack.append("}")
                    elif data == "(":
                        balancing_stack.append(")")
                    elif data == "[":
                        balancing_stack.append("]")
                    elif data in ("}", ")", "]"):
                        if not balancing_stack:
                            raise TemplateSyntaxError(
                                f"unexpected '{data}'", lineno, name, filename
                            )

                        expected_op = balancing_stack.pop()

                        if expected_op != data:
                            raise TemplateSyntaxError(
                                f"unexpected '{data}', expected '{expected_op}'",
                                lineno,
                                name,
                                filename,
                            )

                # yield items
                if data or tokens not in ignore_if_empty:
                    yield lineno, tokens, data

                lineno += data.count("\n")

            line_starting = m.group()[-1:] == "\n"
            # fetch new position into new variable so that we can check
            # if there is an internal parsing error which would result
            # in an infinite loop
            pos2 = m.end()

            # handle state changes
            if new_state is not None:
                # remove the uppermost state
                if new_state == "#pop":
                    stack.pop()
                # resolve the new state by group checking
                elif new_state == "#bygroup":
                    for key, value in m.groupdict().items():
                        if value is not None:
                            stack.append(key)
                            break
                    else:
                        raise RuntimeError(
                            f"{regex!r} wanted to resolve the new state dynamically"
                            f" but no group matched"
                        )
                # direct state name given
                else:
                    stack.append(new_state)

                statetokens = self.rules[stack[-1]]
            # we are still at the same position and no stack change.
            # this means a loop without break condition, avoid that and
            # raise error
            elif pos2 == pos:
                raise RuntimeError(
                    f"{regex!r} yielded empty string without stack change"
                )

            # advance to the new position and start again
            pos = pos2
            break
        # if loop terminated without break we haven't found a single match
        # either we are at the end of the file or we have a problem
        else:
            # end of text
            if pos >= source_length:
                return

            # something went wrong
            raise TemplateSyntaxError(
                f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
            )
This method tokenizes the text and returns the tokens in a generator. Use this method if you just want to tokenize a template.
Changed in version 3.0:
Only \n, \r\n and \r are treated as line
breaks.