blob: 10cfd637a609962eddef13709da9a55e50aa82e8 [file] [log] [blame]
Armin Ronacher92f572f2007-02-26 22:17:32 +01001# -*- coding: utf-8 -*-
2"""
Armin Ronacher07bc6842008-03-31 14:18:49 +02003 jinja2.lexer
4 ~~~~~~~~~~~~
Armin Ronacher3b65b8a2007-02-27 20:21:45 +01005
Armin Ronacher5a8e4972007-04-05 11:21:38 +02006 This module implements a Jinja / Python combination lexer. The
7 `Lexer` class provided by this module is used to do some preprocessing
8 for Jinja.
9
10 On the one hand it filters out invalid operators like the bitshift
11 operators we don't allow in templates. On the other hand it separates
12 template code and python code in expressions.
13
Armin Ronacher62ccd1b2009-01-04 14:26:19 +010014 :copyright: (c) 2009 by the Jinja Team.
Armin Ronacher3b65b8a2007-02-27 20:21:45 +010015 :license: BSD, see LICENSE for more details.
Armin Ronacher92f572f2007-02-26 22:17:32 +010016"""
17import re
Armin Ronacher4325e372008-05-01 22:59:47 +020018from operator import itemgetter
19from collections import deque
Armin Ronacher82b3f3d2008-03-31 20:01:08 +020020from jinja2.exceptions import TemplateSyntaxError
Armin Ronacherb5124e62008-04-25 00:36:14 +020021from jinja2.utils import LRUCache
Armin Ronacher92f572f2007-02-26 22:17:32 +010022
23
# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions shared by all lexer instances
whitespace_re = re.compile(r'\s+', re.U)
# matches a complete single- or double-quoted string including escapes
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
# the lookbehind keeps ``foo.0.0`` from being read as a float attribute
float_re = re.compile(r'(?<!\.)\d+\.\d+')
# all newline styles; used for counting and for newline normalization
newline_re = re.compile(r'(\r\n|\r|\n)')
Armin Ronacher92f572f2007-02-26 22:17:32 +010036
# intern the token type strings ahead of time: lexer and parser compare
# token types very often, and interning lets those comparisons use the
# ``is`` identity fast path
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')
87
# bind operator source text to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

# token type -> operator text, used by ``Token.__str__``; the assert
# guards against two operators accidentally mapping to the same type
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
# longest operators first so e.g. '**' is matched before '*'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
Armin Ronacher1cc232c2007-09-07 17:52:41 +0200122
# token types that are purely lexical noise; ``Lexer.wrap`` drops them
# before the stream reaches the parser.  (The original literal listed
# TOKEN_WHITESPACE twice -- harmless in a frozenset, but redundant.)
ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
127
Armin Ronacher92f572f2007-02-26 22:17:32 +0100128
def count_newlines(value):
    """Count the number of newline characters in the string.  This is
    useful for extensions that filter a stream.
    """
    # one hit per newline sequence (\r\n, \r or \n), counted lazily
    return sum(1 for _ in newline_re.finditer(value))
134
135
def compile_rules(environment):
    """Compile the tag-opening delimiters of the environment into a list
    of ``(name, escaped_delimiter)`` pairs, longest delimiter first so the
    root regex prefers the most specific tag.
    """
    escape = re.escape
    rules = []

    # the three always-present delimiters
    for name, delimiter in [('comment', environment.comment_start_string),
                            ('block', environment.block_start_string),
                            ('variable', environment.variable_start_string)]:
        rules.append((len(delimiter), name, escape(delimiter)))

    # optional line statement / line comment prefixes
    if environment.line_statement_prefix is not None:
        prefix = environment.line_statement_prefix
        rules.append((len(prefix), 'linestatement', '^\\s*' + escape(prefix)))
    if environment.line_comment_prefix is not None:
        prefix = environment.line_comment_prefix
        rules.append((len(prefix), 'linecomment', '\\s*' + escape(prefix)))

    # sort by delimiter length, descending; drop the length afterwards
    rules.sort(reverse=True)
    return [(name, regex) for _, name, regex in rules]
156
157
class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        # error message to raise later; `cls` is the exception class used
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        # instances sit in the lexing rule tables in place of a token
        # type; the tokenizer calls them once position info is known
        raise self.error_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100169
170
class Token(tuple):
    """An immutable ``(lineno, type, value)`` triple produced by the lexer."""
    __slots__ = ()

    # expose the tuple fields under readable names
    lineno = property(itemgetter(0))
    type = property(itemgetter(1))
    value = property(itemgetter(2))

    def __new__(cls, lineno, type, value):
        # the type is interned so the parser can compare it by identity
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        token_type = self.type
        if token_type in reverse_operators:
            return reverse_operators[token_type]
        if token_type == 'name':
            return self.value
        return token_type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # plain string equality on purpose: test_any is usually handed
        # strings that are not interned, so identity checks would fail
        if self.type == expr:
            return True
        if ':' in expr:
            return [self.type, self.value] == expr.split(':', 1)
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (self.lineno, self.type, self.value)
212
213
class TokenStreamIterator(object):
    """Iterator that walks a token stream token by token, closing the
    stream once the eof token shows up.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        current = self.stream.current
        if current.type is not TOKEN_EOF:
            # advance the underlying stream and hand out the old token
            self.stream.next()
            return current
        self.stream.close()
        raise StopIteration()
232
233
class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        # bound `next` of the underlying token generator
        self._next = iter(generator).next
        # tokens pushed back onto the stream (served before `_next`)
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        # prime `current` with a sentinel, then load the first real token
        self.current = Token(1, TOKEN_INITIAL, '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """Are we at the end of the stream?"""
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        # advance, remember the lookahead, then restore the old state by
        # pushing the lookahead back and reinstating the current token
        old_token = self.next()
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            self.next()

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one"""
        rv = self.current
        if self._pushed:
            # pushed-back tokens win over the generator
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = self._next()
            except StopIteration:
                # generator exhausted: switch to the eof token
                self.close()
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            # report only the value part of 'type:value' expressions
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            self.next()
324
325
def get_lexer(environment):
    """Return a lexer for the environment, reusing a cached instance when
    another environment with identical lexer-relevant settings was seen.
    """
    # only these settings influence tokenization, so they form the key
    key = tuple(getattr(environment, attr) for attr in (
        'block_start_string',
        'block_end_string',
        'variable_start_string',
        'variable_end_string',
        'comment_start_string',
        'comment_end_string',
        'line_statement_prefix',
        'line_comment_prefix',
        'trim_blocks',
        'newline_sequence'))
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
Armin Ronacher21580912007-04-17 17:13:10 +0200343
344
class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags; each rule is (regex, token type(s),
        # new state or None)
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled (eats the newline after a tag)
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules: maps lexer state -> ordered rule list
        self.rules = {
            'root': [
                # directives; '{% raw %}' is matched here so its body is
                # treated as data, everything else dispatches #bygroup
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'.*?(?=\n|$)'), TOKEN_LINECOMMENT_END, '#pop')
            ]
        }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and wraps it in a token stream.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokenize` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            # line statements are presented to the parser as normal blocks
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        """
        # normalize line endings so positions are stable
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        # an explicit start state lets callers tokenize tag interiors
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        # closing brackets we still expect, e.g. ['}', ')']
        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token are yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)