blob: 07cf5c606d40bcc4fcc6e3c2791df217fd106020 [file] [log] [blame]
Armin Ronacher92f572f2007-02-26 22:17:32 +01001# -*- coding: utf-8 -*-
2"""
Armin Ronacher07bc6842008-03-31 14:18:49 +02003 jinja2.lexer
4 ~~~~~~~~~~~~
Armin Ronacher3b65b8a2007-02-27 20:21:45 +01005
Armin Ronacher5a8e4972007-04-05 11:21:38 +02006 This module implements a Jinja / Python combination lexer. The
7 `Lexer` class provided by this module is used to do some preprocessing
8 for Jinja.
9
10 On the one hand it filters out invalid operators like the bitshift
11 operators we don't allow in templates. On the other hand it separates
12 template code and python code in expressions.
13
Armin Ronacher55494e42010-01-22 09:41:48 +010014 :copyright: (c) 2010 by the Jinja Team.
Armin Ronacher3b65b8a2007-02-27 20:21:45 +010015 :license: BSD, see LICENSE for more details.
Armin Ronacher92f572f2007-02-26 22:17:32 +010016"""
17import re
Armin Ronacherc87d4cf2013-05-19 13:46:22 +010018import six
19
Armin Ronacher4325e372008-05-01 22:59:47 +020020from operator import itemgetter
21from collections import deque
Armin Ronacher82b3f3d2008-03-31 20:01:08 +020022from jinja2.exceptions import TemplateSyntaxError
Thomas Waldmann7d295622013-05-18 00:06:22 +020023from jinja2.utils import LRUCache
Armin Ronacherc87d4cf2013-05-19 13:46:22 +010024from jinja2._compat import next
Armin Ronacher92f572f2007-02-26 22:17:32 +010025
26
# cache for the lexers.  Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
# matches a single- or double-quoted string including backslash escapes
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')

# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))

# negative lookbehind keeps attribute access like ``foo.0`` from being
# swallowed as part of a float literal
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')
Armin Ronacher92f572f2007-02-26 22:17:32 +010050
try:
    intern = intern  # py2: the builtin
except NameError:
    import sys
    intern = sys.intern  # py3: moved into the sys module

# intern the token type strings and keep references to them.  Token types
# are compared with ``is`` in several places (e.g. ``token.type is
# TOKEN_EOF``), which is only safe because every occurrence refers to
# these shared interned string objects.
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')
107
# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

# token type -> operator text, used to render tokens back into source form
reverse_operators = dict([(v, k) for k, v in six.iteritems(operators)])
# a collision of token types would silently drop an operator above
assert len(operators) == len(reverse_operators), 'operators dropped'
# longest operators first so e.g. '**' wins over '*' in alternation
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))

# token types that the parser never sees; `wrap` drops them.
# (the original listed TOKEN_WHITESPACE twice -- once was enough)
ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
# token types that are dropped when their matched text is empty
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200149
Armin Ronacher92f572f2007-02-26 22:17:32 +0100150
def _describe_token_type(token_type):
    """Return a human readable description for an internal token type."""
    op = reverse_operators.get(token_type)
    if op is not None:
        return op
    descriptions = {
        TOKEN_COMMENT_BEGIN: 'begin of comment',
        TOKEN_COMMENT_END: 'end of comment',
        TOKEN_COMMENT: 'comment',
        TOKEN_LINECOMMENT: 'comment',
        TOKEN_BLOCK_BEGIN: 'begin of statement block',
        TOKEN_BLOCK_END: 'end of statement block',
        TOKEN_VARIABLE_BEGIN: 'begin of print statement',
        TOKEN_VARIABLE_END: 'end of print statement',
        TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement',
        TOKEN_LINESTATEMENT_END: 'end of line statement',
        TOKEN_DATA: 'template data / text',
        TOKEN_EOF: 'end of template',
    }
    # unknown token types describe themselves
    return descriptions.get(token_type, token_type)
168
169
def describe_token(token):
    """Return a human readable description of *token*."""
    # name tokens are best described by their actual value
    return token.value if token.type == 'name' \
        else _describe_token_type(token.type)
175
176
def describe_token_expr(expr):
    """Like `describe_token` but for token expressions of the form
    ``'type'`` or ``'type:value'``.
    """
    # partition leaves token_type == expr when there is no colon
    token_type, sep, value = expr.partition(':')
    if sep and token_type == 'name':
        return value
    return _describe_token_type(token_type)
186
187
def count_newlines(value):
    """Count the number of newline characters in the string.  This is
    useful for extensions that filter a stream.
    """
    # iterate matches lazily instead of materializing a list
    return sum(1 for _ in newline_re.finditer(value))
193
194
def compile_rules(environment):
    """Compile the delimiter rules from *environment* into a list of
    ``(name, pattern)`` pairs, longest start-string first.
    """
    escape = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         escape(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         escape(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         escape(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^[ \t\v]*' + escape(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      escape(environment.line_comment_prefix)))

    # sort the full (length, name, pattern) tuples descending so longer
    # delimiters take precedence in the assembled alternation
    rules.sort(reverse=True)
    return [rule[1:] for rule in rules]
216
217
class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        # message: the error text; cls: the exception class raised on call
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        # invoked by `tokeniter` when a failure rule matches
        raise self.error_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100229
230
class Token(tuple):
    """An immutable ``(lineno, type, value)`` triple produced by the lexer.
    The type string is interned so it can be compared with ``is``.
    """
    __slots__ = ()
    lineno = property(itemgetter(0))
    type = property(itemgetter(1))
    value = property(itemgetter(2))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        token_type = self.type
        if token_type in reverse_operators:
            return reverse_operators[token_type]
        if token_type == 'name':
            return self.value
        return token_type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # plain string equality here: test_any is usually handed
        # strings that are not interned, so ``is`` would be wrong
        if self.type == expr:
            return True
        if ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        return any(self.test(expr) for expr in iterable)

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (self.lineno, self.type, self.value)
272
273
class TokenStreamIterator(six.Iterator):
    """Iterator adapter for a token stream.  Yields tokens until the
    eof token is reached, then closes the underlying stream.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def __next__(self):
        current = self.stream.current
        if current.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        # advance the stream, but hand out the token we saw first
        next(self.stream)
        return current
292
293
class TokenStream(six.Iterator):
    """A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._iter = iter(generator)
        self._pushed = deque()          # tokens pushed back via push()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        # advance once so `current` holds the first real token
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __bool__(self):
        # true while there is anything left to consume
        return bool(self._pushed) or self.current.type is not TOKEN_EOF
    __nonzero__ = __bool__  # py2

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token without consuming it."""
        old_token = next(self)
        result = self.current
        # restore stream state: re-queue the peeked token
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in range(n):
            next(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def __next__(self):
        """Go one token ahead and return the old one."""
        rv = self.current
        if self._pushed:
            # pushed-back tokens take precedence over the generator
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream; `current` becomes a synthetic eof token."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._iter = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.  Raises
        :exc:`TemplateSyntaxError` if the current token does not match.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, describe_token(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        # return the matched token and advance past it
        try:
            return self.current
        finally:
            next(self)
Armin Ronacher4325e372008-05-01 22:59:47 +0200383
384
def get_lexer(environment):
    """Return a lexer which is probably cached."""
    # every setting that influences lexing takes part in the cache key
    key = tuple(getattr(environment, name) for name in (
        'block_start_string',
        'block_end_string',
        'variable_start_string',
        'variable_end_string',
        'comment_start_string',
        'comment_end_string',
        'line_statement_prefix',
        'line_comment_prefix',
        'trim_blocks',
        'lstrip_blocks',
        'newline_sequence',
        'keep_trailing_newline',
    ))
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
Armin Ronacher21580912007-04-17 17:13:10 +0200404
405
Armin Ronacher92f572f2007-02-26 22:17:32 +0100406class Lexer(object):
Armin Ronacherb5124e62008-04-25 00:36:14 +0200407 """Class that implements a lexer for a given environment. Automatically
Armin Ronacher92f572f2007-02-26 22:17:32 +0100408 created by the environment class, usually you don't have to do that.
Armin Ronacher21580912007-04-17 17:13:10 +0200409
410 Note that the lexer is not automatically bound to an environment.
411 Multiple environments can share the same lexer.
Armin Ronacher92f572f2007-02-26 22:17:32 +0100412 """
413
    def __init__(self, environment):
        """Compile all lexing regexes for *environment*'s delimiter
        configuration.  The resulting state machine lives in ``self.rules``,
        a mapping of state name to ``(regex, token(s), new_state)`` rules.
        """
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled (eats the newline after a tag)
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        # strip leading spaces if lstrip_blocks is enabled
        prefix_re = {}
        if environment.lstrip_blocks:
            # use '{%+' to manually disable lstrip_blocks behavior
            no_lstrip_re = e('+')
            # detect overlap between block and variable or comment strings
            block_diff = c(r'^%s(.*)' % e(environment.block_start_string))
            # make sure we don't mistake a block for a variable or a comment
            m = block_diff.match(environment.comment_start_string)
            no_lstrip_re += m and r'|%s' % e(m.group(1)) or ''
            m = block_diff.match(environment.variable_start_string)
            no_lstrip_re += m and r'|%s' % e(m.group(1)) or ''

            # detect overlap between comment and variable strings
            comment_diff = c(r'^%s(.*)' % e(environment.comment_start_string))
            m = comment_diff.match(environment.variable_start_string)
            no_variable_re = m and r'(?!%s)' % e(m.group(1)) or ''

            # either a start string at the beginning of a line (with the
            # leading whitespace consumed) or anywhere with an optional '+'
            lstrip_re = r'^[ \t]*'
            block_prefix_re = r'%s%s(?!%s)|%s\+?' % (
                    lstrip_re,
                    e(environment.block_start_string),
                    no_lstrip_re,
                    e(environment.block_start_string),
                    )
            comment_prefix_re = r'%s%s%s|%s\+?' % (
                    lstrip_re,
                    e(environment.comment_start_string),
                    no_variable_re,
                    e(environment.comment_start_string),
                    )
            prefix_re['block'] = block_prefix_re
            prefix_re['comment'] = comment_prefix_re
        else:
            block_prefix_re = '%s' % e(environment.block_start_string)

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*(?:\-%s\s*|%s))' % (
                        e(environment.block_start_string),
                        block_prefix_re,
                        e(environment.block_end_string),
                        e(environment.block_end_string)
                    )] + [
                        # one named group per root tag rule; tokeniter
                        # resolves '#bygroup' to whichever group matched
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, prefix_re.get(n,r))
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), TOKEN_DATA, None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    block_prefix_re,
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }
Armin Ronacherbf7c4ad2008-04-12 12:02:36 +0200541
    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode.
        Replaces every ``\\r\\n``, ``\\r`` and ``\\n`` with the configured
        ``newline_sequence``.
        """
        return newline_re.sub(self.newline_sequence, value)
545
    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and returns the result as a
        :class:`TokenStream` of :class:`Token`\s.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
551
    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokenize` and wraps
        every token in a :class:`Token` and converts the value.  Tokens in
        `ignored_tokens` and raw markers are dropped, line statement markers
        are rewritten to regular block markers, and literal values (string,
        integer, float, operator) are converted to their Python values.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string: round-trip through ascii with
                # backslashreplace so unicode-escape can process escapes
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception as e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime.  On python 3 this
                # call becomes a noop thanks to 2to3
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100596
Armin Ronacherba6e25a2008-11-02 15:58:14 +0100597 def tokeniter(self, source, name, filename=None, state=None):
Armin Ronacherb5124e62008-04-25 00:36:14 +0200598 """This method tokenizes the text and returns the tokens in a
599 generator. Use this method if you just want to tokenize a template.
Armin Ronacher92f572f2007-02-26 22:17:32 +0100600 """
Armin Ronacherf3acf0b2013-05-19 11:09:19 +0100601 source = six.text_type(source)
W. Trevor King7e912c62013-01-11 08:23:24 -0500602 lines = source.splitlines()
603 if self.keep_trailing_newline and source:
604 for newline in ('\r\n', '\r', '\n'):
605 if source.endswith(newline):
606 lines.append('')
607 break
608 source = '\n'.join(lines)
Armin Ronacher7977e5c2007-03-12 07:22:17 +0100609 pos = 0
610 lineno = 1
Armin Ronacher92f572f2007-02-26 22:17:32 +0100611 stack = ['root']
Armin Ronacherba6e25a2008-11-02 15:58:14 +0100612 if state is not None and state != 'root':
613 assert state in ('variable', 'block'), 'invalid state'
614 stack.append(state + '_begin')
615 else:
616 state = 'root'
617 statetokens = self.rules[stack[-1]]
Armin Ronacher92f572f2007-02-26 22:17:32 +0100618 source_length = len(source)
619
Armin Ronacher21580912007-04-17 17:13:10 +0200620 balancing_stack = []
621
Armin Ronacher71082072008-04-12 14:19:36 +0200622 while 1:
Armin Ronacher92f572f2007-02-26 22:17:32 +0100623 # tokenizer loop
624 for regex, tokens, new_state in statetokens:
625 m = regex.match(source, pos)
Armin Ronacher21580912007-04-17 17:13:10 +0200626 # if no match we try again with the next rule
Armin Ronacher71082072008-04-12 14:19:36 +0200627 if m is None:
Armin Ronacher21580912007-04-17 17:13:10 +0200628 continue
629
Dmitry Jemerov5c5d0612011-09-26 19:04:29 +0200630 # we only match blocks and variables if braces / parentheses
Armin Ronacher21580912007-04-17 17:13:10 +0200631 # are balanced. continue parsing with the lower rule which
632 # is the operator rule. do this only if the end tags look
633 # like operators
634 if balancing_stack and \
Armin Ronacher71082072008-04-12 14:19:36 +0200635 tokens in ('variable_end', 'block_end',
636 'linestatement_end'):
Armin Ronacher21580912007-04-17 17:13:10 +0200637 continue
638
639 # tuples support more options
640 if isinstance(tokens, tuple):
641 for idx, token in enumerate(tokens):
Armin Ronacher21580912007-04-17 17:13:10 +0200642 # failure group
Armin Ronacherd8b8c3e2008-05-22 21:28:32 +0200643 if token.__class__ is Failure:
Armin Ronacher720e55b2007-05-30 00:57:49 +0200644 raise token(lineno, filename)
Armin Ronacher21580912007-04-17 17:13:10 +0200645 # bygroup is a bit more complex, in that case we
646 # yield for the current token the first named
647 # group that matched
648 elif token == '#bygroup':
Thomas Waldmanne0003552013-05-17 23:52:14 +0200649 for key, value in six.iteritems(m.groupdict()):
Armin Ronacher92f572f2007-02-26 22:17:32 +0100650 if value is not None:
Armin Ronacher21580912007-04-17 17:13:10 +0200651 yield lineno, key, value
652 lineno += value.count('\n')
Armin Ronacher92f572f2007-02-26 22:17:32 +0100653 break
654 else:
Armin Ronacher21580912007-04-17 17:13:10 +0200655 raise RuntimeError('%r wanted to resolve '
656 'the token dynamically'
657 ' but no group matched'
658 % regex)
659 # normal group
Armin Ronacher92f572f2007-02-26 22:17:32 +0100660 else:
Armin Ronacher21580912007-04-17 17:13:10 +0200661 data = m.group(idx + 1)
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200662 if data or token not in ignore_if_empty:
Armin Ronacher21580912007-04-17 17:13:10 +0200663 yield lineno, token, data
664 lineno += data.count('\n')
665
Armin Ronacher71082072008-04-12 14:19:36 +0200666 # strings as token just are yielded as it.
Armin Ronacher21580912007-04-17 17:13:10 +0200667 else:
668 data = m.group()
669 # update brace/parentheses balance
670 if tokens == 'operator':
671 if data == '{':
672 balancing_stack.append('}')
673 elif data == '(':
674 balancing_stack.append(')')
675 elif data == '[':
676 balancing_stack.append(']')
677 elif data in ('}', ')', ']'):
Armin Ronacherf750daa2007-05-29 23:22:38 +0200678 if not balancing_stack:
Armin Ronacher5dcb7242010-02-06 14:01:26 +0100679 raise TemplateSyntaxError('unexpected \'%s\'' %
Armin Ronacher7f15ef82008-05-16 09:11:39 +0200680 data, lineno, name,
Armin Ronacherf750daa2007-05-29 23:22:38 +0200681 filename)
682 expected_op = balancing_stack.pop()
683 if expected_op != data:
Armin Ronacher5dcb7242010-02-06 14:01:26 +0100684 raise TemplateSyntaxError('unexpected \'%s\', '
685 'expected \'%s\'' %
Armin Ronacherf750daa2007-05-29 23:22:38 +0200686 (data, expected_op),
Armin Ronacher7f15ef82008-05-16 09:11:39 +0200687 lineno, name,
688 filename)
Armin Ronacher21580912007-04-17 17:13:10 +0200689 # yield items
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200690 if data or tokens not in ignore_if_empty:
691 yield lineno, tokens, data
Armin Ronacher21580912007-04-17 17:13:10 +0200692 lineno += data.count('\n')
693
694 # fetch new position into new variable so that we can check
695 # if there is a internal parsing error which would result
696 # in an infinite loop
697 pos2 = m.end()
698
699 # handle state changes
700 if new_state is not None:
701 # remove the uppermost state
702 if new_state == '#pop':
703 stack.pop()
704 # resolve the new state by group checking
705 elif new_state == '#bygroup':
Thomas Waldmanne0003552013-05-17 23:52:14 +0200706 for key, value in six.iteritems(m.groupdict()):
Armin Ronacher21580912007-04-17 17:13:10 +0200707 if value is not None:
708 stack.append(key)
709 break
710 else:
711 raise RuntimeError('%r wanted to resolve the '
712 'new state dynamically but'
713 ' no group matched' %
714 regex)
715 # direct state name given
716 else:
717 stack.append(new_state)
718 statetokens = self.rules[stack[-1]]
719 # we are still at the same position and no stack change.
720 # this means a loop without break condition, avoid that and
721 # raise error
722 elif pos2 == pos:
723 raise RuntimeError('%r yielded empty string without '
724 'stack change' % regex)
725 # publish new function and start again
726 pos = pos2
727 break
Dmitry Jemerov5c5d0612011-09-26 19:04:29 +0200728 # if loop terminated without break we haven't found a single match
Armin Ronacher92f572f2007-02-26 22:17:32 +0100729 # either we are at the end of the file or we have a problem
730 else:
731 # end of text
732 if pos >= source_length:
733 return
734 # something went wrong
735 raise TemplateSyntaxError('unexpected char %r at %d' %
Armin Ronacher21580912007-04-17 17:13:10 +0200736 (source[pos], pos), lineno,
Armin Ronacher7f15ef82008-05-16 09:11:39 +0200737 name, filename)