blob: 39aac9ff963b2803ad09d4d389555dd60356d59c [file] [log] [blame]
Armin Ronacher92f572f2007-02-26 22:17:32 +01001# -*- coding: utf-8 -*-
2"""
Armin Ronacher07bc6842008-03-31 14:18:49 +02003 jinja2.lexer
4 ~~~~~~~~~~~~
Armin Ronacher3b65b8a2007-02-27 20:21:45 +01005
Armin Ronacher5a8e4972007-04-05 11:21:38 +02006 This module implements a Jinja / Python combination lexer. The
7 `Lexer` class provided by this module is used to do some preprocessing
8 for Jinja.
9
10 On the one hand it filters out invalid operators like the bitshift
11 operators we don't allow in templates. On the other hand it separates
12 template code and python code in expressions.
13
Armin Ronacher62ccd1b2009-01-04 14:26:19 +010014 :copyright: (c) 2009 by the Jinja Team.
Armin Ronacher3b65b8a2007-02-27 20:21:45 +010015 :license: BSD, see LICENSE for more details.
Armin Ronacher92f572f2007-02-26 22:17:32 +010016"""
17import re
Armin Ronacher4325e372008-05-01 22:59:47 +020018from operator import itemgetter
19from collections import deque
Armin Ronacher82b3f3d2008-03-31 20:01:08 +020020from jinja2.exceptions import TemplateSyntaxError
Armin Ronacherb5124e62008-04-25 00:36:14 +020021from jinja2.utils import LRUCache
Armin Ronacher92f572f2007-02-26 22:17:32 +010022
23
# cache for the lexers.  Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
# a string literal in single or double quotes, honoring backslash escapes
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
# negative lookbehind: never match when preceded by a dot (presumably so
# constructs like ``foo.0`` are not lexed as floats — confirm with parser)
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')

# intern the token type strings and keep module-level references to them;
# interned strings let the rest of this module compare token types with
# fast identity checks (``is TOKEN_EOF`` etc.)
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

# reverse mapping (token type -> operator text), used by Token.__str__
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
# sort by length, longest first, so that multi-character operators such
# as ``**`` or ``//`` win over their single-character prefixes in the
# alternation
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
Armin Ronacher1cc232c2007-09-07 17:52:41 +0200119
Armin Ronacher92f572f2007-02-26 22:17:32 +0100120
def count_newlines(value):
    """Count the number of newline characters in the string.  This is
    useful for extensions that filter a stream.
    """
    # count regex matches lazily instead of materializing a findall list
    return sum(1 for _ in newline_re.finditer(value))
126
127
class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        # remember what to report and which exception class to raise with
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        # the actual raise is deferred until the failing lexer rule matches
        raise self.error_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100139
140
class Token(tuple):
    """Immutable ``(lineno, type, value)`` triple produced by the lexer."""
    __slots__ = ()

    # expose the tuple slots under readable names
    lineno = property(itemgetter(0))
    type = property(itemgetter(1))
    value = property(itemgetter(2))

    def __new__(cls, lineno, type, value):
        # intern the type so the parser can compare token types by identity
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        # operators print as their source text, names as their value,
        # everything else as the bare token type
        try:
            return reverse_operators[self.type]
        except KeyError:
            pass
        if self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # plain string equality here: test_any is usually handed
        # non-interned strings, so no identity shortcut
        if self.type == expr:
            return True
        if ':' not in expr:
            return False
        wanted_type, _, wanted_value = expr.partition(':')
        return self.type == wanted_type and self.value == wanted_value

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        return any(self.test(expr) for expr in iterable)

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (self.lineno, self.type, self.value)
182
183
class TokenStreamIterator(object):
    """The iterator for tokenstreams.  Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        stream = self.stream
        token = stream.current
        # keep yielding until the stream parks on the eof token
        if token.type is not TOKEN_EOF:
            stream.next()
            return token
        stream.close()
        raise StopIteration()
202
203
class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        # bound ``next`` of the underlying token generator; set to `None`
        # once the stream is closed
        self._next = iter(generator).next
        # tokens pushed back via push(); served before the generator
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        # start on a dummy initial token, then advance to the real first one
        self.current = Token(1, TOKEN_INITIAL, '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """Are we at the end of the stream?"""
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        # advance once, remember the lookahead, then restore the stream
        # state: push the lookahead back and reinstate the old current
        old_token = self.next()
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            self.next()

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one"""
        rv = self.current
        # pushed-back tokens take priority over the generator
        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = self._next()
            except StopIteration:
                # exhausted generator: park the stream on an eof token
                self.close()
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.  Raises a
        :exc:`TemplateSyntaxError` if the current token does not match.
        """
        if not self.current.test(expr):
            # report only the value part of 'type:value' expressions
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        # return the matched token, advancing past it on the way out
        try:
            return self.current
        finally:
            self.next()
294
295
def get_lexer(environment):
    """Return a lexer which is probably cached."""
    # every environment setting that influences lexing becomes part of
    # the cache key, so environments with identical syntax share a lexer
    key = tuple(getattr(environment, attr) for attr in (
        'block_start_string',
        'block_end_string',
        'variable_start_string',
        'variable_end_string',
        'comment_start_string',
        'comment_end_string',
        'line_statement_prefix',
        'trim_blocks',
        'newline_sequence',
    ))
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
Armin Ronacher21580912007-04-17 17:13:10 +0200312
313
class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts: `c` compiles a pattern in multiline/dotall mode,
        # `e` escapes a literal delimiter string for use in a regex
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules that apply inside tags (blocks, variables, ...)
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules. This is done here so that the escape
        # signs don't count for the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that. We add this rule *after* all the others.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))

        # block suffix if trimming is enabled: eats the newline right
        # after a block tag
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules, keyed by lexer state; each rule is a
        # (compiled regex, token(s), new state) triple
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules
        }

    def _normalize_newlines(self, value):
        """Replace all newline flavors (``\\r\\n``, ``\\r``, ``\\n``) in
        *value* with the environment's configured newline sequence.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and wraps it in a token stream.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokenize` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            # whitespace and comments never reach the parser
            if token in ('comment_begin', 'comment', 'comment_end',
                         'whitespace'):
                continue
            # line statements are presented to the parser as regular blocks
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string: strip the quotes, then round-trip
                # through ascii + unicode-escape to resolve backslash escapes
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                # map the operator text to its interned token type
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        Yields ``(lineno, token_type, value)`` triples.
        """
        # normalize all newline flavors to '\n' before matching
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        # an explicit start state ('variable' or 'block') lets callers
        # tokenize a fragment as if it appeared inside such a tag
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        # closing brackets we still expect; guards tag-end detection
        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group: a Failure instance raises a
                        # TemplateSyntaxError when called
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token just are yielded as is.
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
Armin Ronacher7f15ef82008-05-16 09:11:39 +0200609 name, filename)