# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2010 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache, next


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')

# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))

float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')

# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
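
# An illustrative note, not from the original source: the longest-first
# sort above matters because '|' alternation in a regex picks the first
# branch that matches, not the longest one.  With the sorted pattern:
#
#     >>> operator_re.match('**').group()
#     '**'
#
# A pattern listing '*' before '**' would instead match a lone '*' and
# could never produce the 'pow' operator token.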

ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])


def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))

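# A small doctest-style sketch, added for illustration: because
# newline_re matches '\r\n' as a single newline, mixed line endings
# are each counted once:
#
#     >>> count_newlines(u'foo\nbar\r\nbaz')
#     2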

def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^\s*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]

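# For illustration (not part of the original module): with a default
# jinja2.Environment the result is sorted longest prefix first and, for
# the equal-length default delimiters, reverse-alphabetically by name:
#
#     >>> from jinja2 import Environment
#     >>> compile_rules(Environment())
#     [('variable', '\\{\\{'), ('comment', '\\{\\#'), ('block', '\\{\\%')]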

class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of non-interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False
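        # Illustrative examples, added; not in the original source:
        #
        #     >>> Token(1, 'name', 'foo').test('name')
        #     True
        #     >>> Token(1, 'name', 'foo').test('name:bar')
        #     False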

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )


class TokenStreamIterator(object):
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        next(self.stream)
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            next(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one"""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = self._next()
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            next(self)

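# A short usage sketch for TokenStream, added for illustration; the
# `tokens` variable below is hypothetical:
#
#     stream = TokenStream(iter(tokens), None, None)
#     stream.skip_if('whitespace')        # consume an optional token
#     name_token = stream.expect('name')  # or raise TemplateSyntaxError
#     upcoming = stream.look()            # peek without consuming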

def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer

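# Since the cache key above covers all lexer-relevant settings, two
# identically configured environments share one lexer.  A sketch,
# assuming the default jinja2.Environment (whose `lexer` property
# resolves through get_lexer):
#
#     >>> from jinja2 import Environment
#     >>> Environment().lexer is Environment().lexer
#     True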

class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to create one
    yourself.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants ASP-like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''
391
Armin Ronacherf3c35c42008-05-23 23:18:14 +0200392 self.newline_sequence = environment.newline_sequence
393
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200394 # global lexing rules
Armin Ronacher92f572f2007-02-26 22:17:32 +0100395 self.rules = {
396 'root': [
Armin Ronacher523bf4c2007-11-17 23:45:04 +0100397 # directives
398 (c('(.*?)(?:%s)' % '|'.join(
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200399 [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
Armin Ronacher523bf4c2007-11-17 23:45:04 +0100400 e(environment.block_start_string),
401 e(environment.block_start_string),
402 e(environment.block_end_string)
403 )] + [
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200404 r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
Armin Ronacher523bf4c2007-11-17 23:45:04 +0100405 for n, r in root_tag_rules
Armin Ronacherb3b58022009-02-04 19:33:58 +0100406 ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200407 # data
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200408 (c('.+'), TOKEN_DATA, None)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100409 ],
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200410 # comments
Armin Ronacherb3b58022009-02-04 19:33:58 +0100411 TOKEN_COMMENT_BEGIN: [
Armin Ronachera5c8d582007-03-31 20:40:38 +0200412 (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
Armin Ronacher1151fbc2007-03-28 21:44:04 +0200413 e(environment.comment_end_string),
Armin Ronachera5c8d582007-03-31 20:40:38 +0200414 e(environment.comment_end_string),
415 block_suffix_re
Armin Ronacherb3b58022009-02-04 19:33:58 +0100416 )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
Armin Ronacher92f572f2007-02-26 22:17:32 +0100417 (c('(.)'), (Failure('Missing end of comment tag'),), None)
418 ],
Armin Ronacher21580912007-04-17 17:13:10 +0200419 # blocks
Armin Ronacherb3b58022009-02-04 19:33:58 +0100420 TOKEN_BLOCK_BEGIN: [
Armin Ronachera5c8d582007-03-31 20:40:38 +0200421 (c('(?:\-%s\s*|%s)%s' % (
Armin Ronacher1151fbc2007-03-28 21:44:04 +0200422 e(environment.block_end_string),
Armin Ronachera5c8d582007-03-31 20:40:38 +0200423 e(environment.block_end_string),
424 block_suffix_re
Armin Ronacherb3b58022009-02-04 19:33:58 +0100425 )), TOKEN_BLOCK_END, '#pop'),
Armin Ronacher92f572f2007-02-26 22:17:32 +0100426 ] + tag_rules,
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200427 # variables
Armin Ronacherb3b58022009-02-04 19:33:58 +0100428 TOKEN_VARIABLE_BEGIN: [
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200429 (c('\-%s\s*|%s' % (
430 e(environment.variable_end_string),
431 e(environment.variable_end_string)
Armin Ronacherb3b58022009-02-04 19:33:58 +0100432 )), TOKEN_VARIABLE_END, '#pop')
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200433 ] + tag_rules,
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200434 # raw block
Armin Ronacherb3b58022009-02-04 19:33:58 +0100435 TOKEN_RAW_BEGIN: [
Armin Ronacher1151fbc2007-03-28 21:44:04 +0200436 (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
437 e(environment.block_start_string),
438 e(environment.block_start_string),
439 e(environment.block_end_string),
440 e(environment.block_end_string),
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200441 block_suffix_re
Armin Ronacherb3b58022009-02-04 19:33:58 +0100442 )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200443 (c('(.)'), (Failure('Missing end of raw directive'),), None)
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200444 ],
445 # line statements
Armin Ronacherb3b58022009-02-04 19:33:58 +0100446 TOKEN_LINESTATEMENT_BEGIN: [
447 (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200448 ] + tag_rules,
449 # line comments
450 TOKEN_LINECOMMENT_BEGIN: [
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200451 (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
452 TOKEN_LINECOMMENT_END), '#pop')
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200453 ]
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200454 }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize newlines
        to the configured sequence.
        """
        return newline_re.sub(self.newline_sequence, value)
459
Armin Ronacherba6e25a2008-11-02 15:58:14 +0100460 def tokenize(self, source, name=None, filename=None, state=None):
Armin Ronacher9ad96e72008-06-13 22:44:01 +0200461 """Calls tokeniter + tokenize and wraps it in a token stream.
Armin Ronacher92f572f2007-02-26 22:17:32 +0100462 """
Armin Ronacherba6e25a2008-11-02 15:58:14 +0100463 stream = self.tokeniter(source, name, filename, state)
Armin Ronacher9ad96e72008-06-13 22:44:01 +0200464 return TokenStream(self.wrap(stream, name, filename), name, filename)
465
466 def wrap(self, stream, name=None, filename=None):
467 """This is called with the stream as returned by `tokenize` and wraps
468 every token in a :class:`Token` and converts the value.
469 """
470 for lineno, token, value in stream:
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200471 if token in ignored_tokens:
Armin Ronacher9ad96e72008-06-13 22:44:01 +0200472 continue
473 elif token == 'linestatement_begin':
474 token = 'block_begin'
475 elif token == 'linestatement_end':
476 token = 'block_end'
477 # we are not interested in those tokens in the parser
478 elif token in ('raw_begin', 'raw_end'):
479 continue
480 elif token == 'data':
481 value = self._normalize_newlines(value)
482 elif token == 'keyword':
483 token = value
484 elif token == 'name':
485 value = str(value)
486 elif token == 'string':
487 # try to unescape string
488 try:
489 value = self._normalize_newlines(value[1:-1]) \
490 .encode('ascii', 'backslashreplace') \
491 .decode('unicode-escape')
492 except Exception, e:
493 msg = str(e).split(':')[-1].strip()
494 raise TemplateSyntaxError(msg, lineno, name, filename)
495 # if we can express it as bytestring (ascii only)
496 # we do that for support of semi broken APIs
497 # as datetime.datetime.strftime
498 try:
Armin Ronacherd1ff8582008-05-11 00:30:43 +0200499 value = str(value)
Armin Ronacher9ad96e72008-06-13 22:44:01 +0200500 except UnicodeError:
501 pass
502 elif token == 'integer':
503 value = int(value)
504 elif token == 'float':
505 value = float(value)
506 elif token == 'operator':
507 token = operators[value]
508 yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token types are just yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
Armin Ronacher7f15ef82008-05-16 09:11:39 +0200643 name, filename)