# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2010 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache, next


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')

# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))

float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')

# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
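
# A quick illustrative doctest (a sketch, not part of the original file):
# the longest-first sort above makes two-character operators win over their
# one-character prefixes, and `reverse_operators` maps types back to text.
#
#     >>> operator_re.match(u'**').group()
#     u'**'
#     >>> reverse_operators[TOKEN_POW]
#     '**'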

ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])


def _describe_token_type(token_type):
    if token_type in reverse_operators:
        return reverse_operators[token_type]
    return {
        TOKEN_COMMENT_BEGIN: 'begin of comment',
        TOKEN_COMMENT_END: 'end of comment',
        TOKEN_COMMENT: 'comment',
        TOKEN_LINECOMMENT: 'comment',
        TOKEN_BLOCK_BEGIN: 'begin of statement block',
        TOKEN_BLOCK_END: 'end of statement block',
        TOKEN_VARIABLE_BEGIN: 'begin of print statement',
        TOKEN_VARIABLE_END: 'end of print statement',
        TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement',
        TOKEN_LINESTATEMENT_END: 'end of line statement',
        TOKEN_DATA: 'template data / text',
        TOKEN_EOF: 'end of template'
    }.get(token_type, token_type)


def describe_token(token):
    """Returns a description of the token."""
    if token.type == 'name':
        return token.value
    return _describe_token_type(token.type)


def describe_token_expr(expr):
    """Like `describe_token` but for token expressions."""
    if ':' in expr:
        type, value = expr.split(':', 1)
        if type == 'name':
            return value
    else:
        type = expr
    return _describe_token_type(type)

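# Illustrative doctest (a sketch): `describe_token_expr` resolves both plain
# token types and `type:value` pairs.
#
#     >>> describe_token_expr('name:endfor')
#     'endfor'
#     >>> describe_token_expr('block_end')
#     'end of statement block'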

def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))

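# Illustrative doctest (a sketch): all three newline styles are counted.
#
#     >>> count_newlines(u'a\nb\r\nc\rd')
#     3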

def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^\s*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]

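# Illustrative doctest (a sketch; assumes `jinja2.Environment` is available):
# sorting longest-first means an ASP-style '<%=' variable start is tried
# before the shorter '<%' block start.
#
#     >>> from jinja2 import Environment
#     >>> env = Environment(block_start_string='<%',
#     ...                   variable_start_string='<%=')
#     >>> compile_rules(env)[0][0]
#     'variable'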

class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )

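# Illustrative doctest for Token (a sketch):
#
#     >>> tok = Token(1, 'name', 'foo')
#     >>> tok.test('name'), tok.test('name:foo'), tok.test('name:bar')
#     (True, True, False)
#     >>> tok.test_any('integer', 'name:foo')
#     True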

class TokenStreamIterator(object):
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        next(self.stream)
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            next(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one"""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = self._next()
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, describe_token(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            next(self)

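# Illustrative doctest for TokenStream (a sketch; `next` here is the
# compatibility helper imported from `jinja2.utils`):
#
#     >>> s = TokenStream(iter([Token(1, 'name', 'foo')]), None, None)
#     >>> s.current.value
#     'foo'
#     >>> next(s).value            # advance, returning the old token
#     'foo'
#     >>> s.eos
#     True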

def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.lstrip_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer

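# Illustrative doctest (a sketch; assumes `jinja2.Environment`): environments
# with identical lexer-relevant settings share one cached lexer instance.
#
#     >>> from jinja2 import Environment
#     >>> get_lexer(Environment()) is get_lexer(Environment())
#     True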

class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''
        # strip leading spaces if lstrip_blocks is enabled
        block_prefix_re = environment.lstrip_blocks and r'^[ \t]*' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s%s|%s)\s*raw\s*(?:\-%s\s*|%s))' % (
                        e(environment.block_start_string),
                        block_prefix_re,
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (
                            n, r, r if n != 'block' else
                            '%s%s|%s' % (block_prefix_re, r, r))
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), TOKEN_DATA, None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s%s|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    block_prefix_re,
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize the newlines
        to the configured sequence."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Tokenize the source and wrap the token generator in a
        :class:`TokenStream`.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

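    # Illustrative doctest (a sketch; assumes `jinja2.Environment`, whose
    # `lexer` property returns the cached lexer for its settings). Note that
    # whitespace tokens are filtered out and iteration stops before `eof`:
    #
    #     >>> from jinja2 import Environment
    #     >>> stream = Environment().lexer.tokenize(u'a {{ b }}')
    #     >>> [t.type for t in stream]
    #     ['data', 'variable_begin', 'name', 'variable_end']
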
    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # such as datetime.datetime.strftime. On python 3 this
                # call becomes a noop thanks to 2to3
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as tokens are just yielded as-is.
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected \'%s\'' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected \'%s\', '
                                                          'expected \'%s\'' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start over
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)