blob: 5594e0a64b6cac9a7185c18c36651452cae6479e [file] [log] [blame]
Armin Ronacher92f572f2007-02-26 22:17:32 +01001# -*- coding: utf-8 -*-
2"""
Armin Ronacher07bc6842008-03-31 14:18:49 +02003 jinja2.lexer
4 ~~~~~~~~~~~~
Armin Ronacher3b65b8a2007-02-27 20:21:45 +01005
Armin Ronacher5a8e4972007-04-05 11:21:38 +02006 This module implements a Jinja / Python combination lexer. The
7 `Lexer` class provided by this module is used to do some preprocessing
8 for Jinja.
9
10 On the one hand it filters out invalid operators like the bitshift
11 operators we don't allow in templates. On the other hand it separates
12 template code and python code in expressions.
13
Armin Ronacher55494e42010-01-22 09:41:48 +010014 :copyright: (c) 2010 by the Jinja Team.
Armin Ronacher3b65b8a2007-02-27 20:21:45 +010015 :license: BSD, see LICENSE for more details.
Armin Ronacher92f572f2007-02-26 22:17:32 +010016"""
17import re
Armin Ronacher4325e372008-05-01 22:59:47 +020018from operator import itemgetter
19from collections import deque
Armin Ronacher82b3f3d2008-03-31 20:01:08 +020020from jinja2.exceptions import TemplateSyntaxError
Thomas Waldmann7d295622013-05-18 00:06:22 +020021from jinja2.utils import LRUCache
Thomas Waldmanne0003552013-05-17 23:52:14 +020022import six
Armin Ronacher92f572f2007-02-26 22:17:32 +010023
24
# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
# matches a single- or double-quoted string including backslash escapes;
# re.S lets literal newlines appear inside the string token
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')

# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))

# the negative lookbehind keeps '1.2.3' from being read as two floats
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')
Armin Ronacher92f572f2007-02-26 22:17:32 +010048
try:
    intern = intern  # py2: builtin
except NameError:
    import sys
    intern = sys.intern  # py3: moved to the sys module

# intern the token type strings and keep references to them.  Interning
# makes the parser's frequent identity (``is``) checks on token types valid
# and dictionary lookups cheap.
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

# reverse mapping (token type -> operator text).  Built exactly once, so a
# plain ``items()`` iteration is fine on both Python 2 and 3; no need for
# ``six.iteritems`` here.
reverse_operators = dict((v, k) for k, v in operators.items())
assert len(operators) == len(reverse_operators), 'operators dropped'
# sort longest-first so e.g. '**' is tried before '*' in the alternation
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))

# token types that never reach the parser (dropped by `Lexer.wrap`).
# NOTE: the previous version listed TOKEN_WHITESPACE twice; the frozenset
# deduplicated it anyway, the redundant entry is simply removed.
ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
# token types that are suppressed entirely when their payload is empty
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200147
Armin Ronacher92f572f2007-02-26 22:17:32 +0100148
# human readable descriptions for the non-operator token types.  Built once
# at import time instead of being re-created on every call.
_token_type_descriptions = {
    TOKEN_COMMENT_BEGIN: 'begin of comment',
    TOKEN_COMMENT_END: 'end of comment',
    TOKEN_COMMENT: 'comment',
    TOKEN_LINECOMMENT: 'comment',
    TOKEN_BLOCK_BEGIN: 'begin of statement block',
    TOKEN_BLOCK_END: 'end of statement block',
    TOKEN_VARIABLE_BEGIN: 'begin of print statement',
    TOKEN_VARIABLE_END: 'end of print statement',
    TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement',
    TOKEN_LINESTATEMENT_END: 'end of line statement',
    TOKEN_DATA: 'template data / text',
    TOKEN_EOF: 'end of template'
}


def _describe_token_type(token_type):
    """Return a human readable description for *token_type*.

    Operator token types map back to their operator text; the remaining
    well-known types use the table above; anything else is returned as-is.
    """
    if token_type in reverse_operators:
        return reverse_operators[token_type]
    return _token_type_descriptions.get(token_type, token_type)
166
167
def describe_token(token):
    """Returns a description of the token."""
    token_type = token.type
    # name tokens are best described by their actual value
    if token_type == 'name':
        return token.value
    return _describe_token_type(token_type)
173
174
def describe_token_expr(expr):
    """Like `describe_token` but for token expressions of the form
    ``'type'`` or ``'type:value'``.
    """
    if ':' not in expr:
        return _describe_token_type(expr)
    token_type, value = expr.split(':', 1)
    # a name expression is described by the expected value itself
    if token_type == 'name':
        return value
    return _describe_token_type(token_type)
184
185
def count_newlines(value):
    """Count the number of newline characters in the string.  This is
    useful for extensions that filter a stream.
    """
    # iterate the matches lazily instead of materializing a list
    return sum(1 for _ in newline_re.finditer(value))
191
192
def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    escape = re.escape
    # each entry: (prefix length, rule name, escaped/partial pattern)
    rules = [
        (len(environment.comment_start_string), 'comment',
         escape(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         escape(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         escape(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        prefix = environment.line_statement_prefix
        rules.append((len(prefix), 'linestatement',
                      r'^[ \t\v]*' + escape(prefix)))
    if environment.line_comment_prefix is not None:
        prefix = environment.line_comment_prefix
        rules.append((len(prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' + escape(prefix)))

    # longer start strings must be tried first (so e.g. '<%=' wins over
    # '<%'); the length key is stripped off again before returning
    rules.sort(reverse=True)
    return [rule[1:] for rule in rules]
214
215
class Failure(object):
    """Callable sentinel used in lexing rules.  Invoking an instance
    raises the configured `TemplateSyntaxError` (or a custom error class)
    with a known message; the `Lexer` attaches these to rules whose match
    always indicates broken input.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        # raise at the position the lexer reached when the rule fired
        raise self.error_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100227
228
class Token(tuple):
    """Immutable ``(lineno, type, value)`` triple produced by the lexer."""
    __slots__ = ()

    # expose the tuple slots under readable names
    lineno = property(itemgetter(0))
    type = property(itemgetter(1))
    value = property(itemgetter(2))

    def __new__(cls, lineno, type, value):
        # token types are interned so the parser can use identity checks
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        if self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        if ':' in expr:
            expected_type, expected_value = expr.split(':', 1)
            return expected_type == self.type and expected_value == self.value
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        return any(self.test(expr) for expr in iterable)

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
270
271
class TokenStreamIterator(six.Iterator):
    """The iterator for tokenstreams.  Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        # the TokenStream this iterator drains
        self.stream = stream

    def __iter__(self):
        return self

    def __next__(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            # reaching eof closes the underlying stream and stops iteration
            self.stream.close()
            raise StopIteration()
        # advance the stream so the next call sees the following token
        six.advance_iterator(self.stream)
        return token
290
291
class TokenStream(six.Iterator):
    """A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._iter = iter(generator)
        # tokens pushed back via push() are served before the iterator
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        # load the first real token into `current`
        six.advance_iterator(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __bool__(self):
        # true as long as there is anything left to consume
        return bool(self._pushed) or self.current.type is not TOKEN_EOF
    __nonzero__ = __bool__  # py2

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token without consuming it."""
        # advance, remember the new token, then push it back and restore
        # the old current token
        old_token = six.advance_iterator(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in range(n):
            six.advance_iterator(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return six.advance_iterator(self)

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def __next__(self):
        """Go one token ahead and return the old one"""
        rv = self.current
        if self._pushed:
            # pushed-back tokens take precedence over the iterator
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = six.advance_iterator(self._iter)
            except StopIteration:
                # exhausting the iterator pins `current` at eof
                self.close()
        return rv

    def close(self):
        """Close the stream and release the underlying iterator."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._iter = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, describe_token(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        # return the matched token and advance past it
        try:
            return self.current
        finally:
            six.advance_iterator(self)
Armin Ronacher4325e372008-05-01 22:59:47 +0200381
382
def get_lexer(environment):
    """Return a lexer which is probably cached."""
    # every environment setting that influences lexing is part of the
    # cache key; environments that agree on all of them share one lexer
    key = tuple(getattr(environment, attr) for attr in (
        'block_start_string', 'block_end_string',
        'variable_start_string', 'variable_end_string',
        'comment_start_string', 'comment_end_string',
        'line_statement_prefix', 'line_comment_prefix',
        'trim_blocks', 'newline_sequence', 'keep_trailing_newline'))
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
Armin Ronacher21580912007-04-17 17:13:10 +0200401
402
class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags; tried in order inside block/variable state
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled: eats the newline right
        # after a block tag
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        # global lexing rules: state name -> list of
        # (compiled regex, token type(s), new state) entries
        self.rules = {
            'root': [
                # directives: match everything up to the next tag start;
                # '#bygroup' resolves token / new state from the named
                # group that actually matched
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*(?:\-%s\s*|%s))' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data: plain template text up to end of input
                (c('.+'), TOKEN_DATA, None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block: everything verbatim until the matching endraw tag
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements end at the newline (or end of input)
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments: the empty second group marks the end position
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Tokenize *source* with :meth:`tokeniter`, wrap the result via
        :meth:`wrap` and return it as a :class:`TokenStream`.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            # line statements are presented to the parser as normal blocks
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string (strip the quotes first)
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception as e:
                    # surface the codec's reason as a template syntax error
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime.  On python 3 this
                # call becomes a noop thanks to 2to3
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                # map the operator text to its interned token type
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        """
        source = six.text_type(source)
        lines = source.splitlines()
        # splitlines() drops a trailing newline; re-add it if the
        # environment asks to keep it
        if self.keep_trailing_newline and source:
            for newline in ('\r\n', '\r', '\n'):
                if source.endswith(newline):
                    lines.append('')
                    break
        source = '\n'.join(lines)
        pos = 0
        lineno = 1
        stack = ['root']
        # allow starting directly inside a variable or block tag
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        # stack of expected closing brackets for { ( [ inside tags
        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group: raise the attached error
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in six.iteritems(m.groupdict()):
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token just are yielded as it.
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected \'%s\'' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected \'%s\', '
                                                          'expected \'%s\'' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in six.iteritems(m.groupdict()):
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new function and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)