# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2009 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')

# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')
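
# Why intern?  (Illustrative note, not part of the original module.)
# `intern()` guarantees a single shared string object per token name, so
# the rest of the module can compare token types with the fast `is`
# operator instead of character-by-character equality, e.g.:
#
#     >>> Token(1, 'eof', '').type is TOKEN_EOF
#     True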

# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
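
# A note on `operator_re` (illustrative, not part of the original module):
# the alternation is built longest-first because Python's "|" takes the
# first alternative that matches, so '**' must be tried before '*' and
# '//' before '/':
#
#     >>> operator_re.match('**').group()
#     '**'
#     >>> operator_re.match('//').group()
#     '//'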

ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])


def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))
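
# Example (illustrative, not part of the original module): `newline_re`
# matches '\r\n', '\r' and '\n', so every newline convention is counted:
#
#     >>> count_newlines(u'foo\r\nbar\rbaz\n')
#     3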


def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^\s*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]
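
# Example (illustrative, not part of the original module): for an
# environment with the default delimiters all start strings have length 2
# and the result is roughly
#
#     [('variable', '\\{\\{'), ('comment', '\\{\\#'), ('block', '\\{\\%')]
#
# The descending sort matters as soon as delimiters share a prefix, e.g.
# '<%' for blocks and '<%=' for variables: the longer '<%=' has to be
# tried first.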


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100171
172
Armin Ronacher4325e372008-05-01 22:59:47 +0200173class Token(tuple):
174 """Token class."""
175 __slots__ = ()
176 lineno, type, value = (property(itemgetter(x)) for x in range(3))
177
178 def __new__(cls, lineno, type, value):
179 return tuple.__new__(cls, (lineno, intern(str(type)), value))
180
181 def __str__(self):
Armin Ronacher8a1d27f2008-05-19 08:37:19 +0200182 if self.type in reverse_operators:
Armin Ronacher4325e372008-05-01 22:59:47 +0200183 return reverse_operators[self.type]
Ali Afshar272ca2a2009-01-05 12:14:14 +0100184 elif self.type == 'name':
Armin Ronacher4325e372008-05-01 22:59:47 +0200185 return self.value
186 return self.type
187
188 def test(self, expr):
189 """Test a token against a token expression. This can either be a
Armin Ronacher023b5e92008-05-08 11:03:10 +0200190 token type or ``'token_type:token_value'``. This can only test
191 against string values and types.
Armin Ronacher4325e372008-05-01 22:59:47 +0200192 """
Armin Ronachercda43df2008-05-03 17:10:05 +0200193 # here we do a regular string equality check as test_any is usually
Armin Ronacher4325e372008-05-01 22:59:47 +0200194 # passed an iterable of not interned strings.
195 if self.type == expr:
196 return True
197 elif ':' in expr:
198 return expr.split(':', 1) == [self.type, self.value]
199 return False
200
Armin Ronachercda43df2008-05-03 17:10:05 +0200201 def test_any(self, *iterable):
Armin Ronacher4325e372008-05-01 22:59:47 +0200202 """Test against multiple token expressions."""
203 for expr in iterable:
204 if self.test(expr):
205 return True
206 return False
207
208 def __repr__(self):
209 return 'Token(%r, %r, %r)' % (
210 self.lineno,
211 self.type,
212 self.value
213 )
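
# Example (illustrative, not part of the original module): token
# expressions accept either a bare type or a 'type:value' pair.
#
#     >>> tok = Token(1, 'name', 'foo')
#     >>> tok.test('name')
#     True
#     >>> tok.test('name:bar')
#     False
#     >>> tok.test_any('integer', 'name:foo')
#     True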


class TokenStreamIterator(object):
    """The iterator for token streams.  Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        self.stream.next()
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """Are we at the end of the stream?"""
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = self.next()
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            self.next()

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one."""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = self._next()
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            self.next()
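
# Example (illustrative, not part of the original module): typical parser
# usage of a stream.  `expect` consumes a token or raises an error,
# `skip_if` consumes only if the test matches, and `look` peeks at the
# token after `current` without consuming anything.
#
#     >>> from jinja2 import Environment
#     >>> stream = get_lexer(Environment()).tokenize(u'{% if x %}')
#     >>> stream.expect('block_begin')
#     Token(1, 'block_begin', u'{%')
#     >>> stream.skip_if('name:if')
#     True
#     >>> stream.look()
#     Token(1, 'block_end', u'%}')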


def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
Armin Ronacher21580912007-04-17 17:13:10 +0200345
346
Armin Ronacher92f572f2007-02-26 22:17:32 +0100347class Lexer(object):
Armin Ronacherb5124e62008-04-25 00:36:14 +0200348 """Class that implements a lexer for a given environment. Automatically
Armin Ronacher92f572f2007-02-26 22:17:32 +0100349 created by the environment class, usually you don't have to do that.
Armin Ronacher21580912007-04-17 17:13:10 +0200350
351 Note that the lexer is not automatically bound to an environment.
352 Multiple environments can share the same lexer.
Armin Ronacher92f572f2007-02-26 22:17:32 +0100353 """
354
355 def __init__(self, environment):
356 # shortcuts
357 c = lambda x: re.compile(x, re.M | re.S)
358 e = re.escape
359
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200360 # lexing rules for tags
Armin Ronacher92f572f2007-02-26 22:17:32 +0100361 tag_rules = [
Armin Ronacherb3b58022009-02-04 19:33:58 +0100362 (whitespace_re, TOKEN_WHITESPACE, None),
363 (float_re, TOKEN_FLOAT, None),
364 (integer_re, TOKEN_INTEGER, None),
365 (name_re, TOKEN_NAME, None),
366 (string_re, TOKEN_STRING, None),
367 (operator_re, TOKEN_OPERATOR, None)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100368 ]
369
Armin Ronacherd874fbe2007-02-27 20:51:59 +0100370 # assamble the root lexing rule. because "|" is ungreedy
371 # we have to sort by length so that the lexer continues working
372 # as expected when we have parsing rules like <% for block and
373 # <%= for variables. (if someone wants asp like syntax)
Armin Ronacher33d528a2007-05-14 18:21:44 +0200374 # variables are just part of the rules if variable processing
375 # is required.
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200376 root_tag_rules = compile_rules(environment)
Armin Ronacherd874fbe2007-02-27 20:51:59 +0100377
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200378 # block suffix if trimming is enabled
379 block_suffix_re = environment.trim_blocks and '\\n?' or ''
380
Armin Ronacherf3c35c42008-05-23 23:18:14 +0200381 self.newline_sequence = environment.newline_sequence
382
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200383 # global lexing rules
Armin Ronacher92f572f2007-02-26 22:17:32 +0100384 self.rules = {
385 'root': [
Armin Ronacher523bf4c2007-11-17 23:45:04 +0100386 # directives
387 (c('(.*?)(?:%s)' % '|'.join(
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200388 [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
Armin Ronacher523bf4c2007-11-17 23:45:04 +0100389 e(environment.block_start_string),
390 e(environment.block_start_string),
391 e(environment.block_end_string)
392 )] + [
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200393 r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
Armin Ronacher523bf4c2007-11-17 23:45:04 +0100394 for n, r in root_tag_rules
Armin Ronacherb3b58022009-02-04 19:33:58 +0100395 ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200396 # data
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200397 (c('.+'), TOKEN_DATA, None)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100398 ],
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200399 # comments
Armin Ronacherb3b58022009-02-04 19:33:58 +0100400 TOKEN_COMMENT_BEGIN: [
Armin Ronachera5c8d582007-03-31 20:40:38 +0200401 (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
Armin Ronacher1151fbc2007-03-28 21:44:04 +0200402 e(environment.comment_end_string),
Armin Ronachera5c8d582007-03-31 20:40:38 +0200403 e(environment.comment_end_string),
404 block_suffix_re
Armin Ronacherb3b58022009-02-04 19:33:58 +0100405 )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
Armin Ronacher92f572f2007-02-26 22:17:32 +0100406 (c('(.)'), (Failure('Missing end of comment tag'),), None)
407 ],
Armin Ronacher21580912007-04-17 17:13:10 +0200408 # blocks
Armin Ronacherb3b58022009-02-04 19:33:58 +0100409 TOKEN_BLOCK_BEGIN: [
Armin Ronachera5c8d582007-03-31 20:40:38 +0200410 (c('(?:\-%s\s*|%s)%s' % (
Armin Ronacher1151fbc2007-03-28 21:44:04 +0200411 e(environment.block_end_string),
Armin Ronachera5c8d582007-03-31 20:40:38 +0200412 e(environment.block_end_string),
413 block_suffix_re
Armin Ronacherb3b58022009-02-04 19:33:58 +0100414 )), TOKEN_BLOCK_END, '#pop'),
Armin Ronacher92f572f2007-02-26 22:17:32 +0100415 ] + tag_rules,
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200416 # variables
Armin Ronacherb3b58022009-02-04 19:33:58 +0100417 TOKEN_VARIABLE_BEGIN: [
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200418 (c('\-%s\s*|%s' % (
419 e(environment.variable_end_string),
420 e(environment.variable_end_string)
Armin Ronacherb3b58022009-02-04 19:33:58 +0100421 )), TOKEN_VARIABLE_END, '#pop')
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200422 ] + tag_rules,
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200423 # raw block
Armin Ronacherb3b58022009-02-04 19:33:58 +0100424 TOKEN_RAW_BEGIN: [
Armin Ronacher1151fbc2007-03-28 21:44:04 +0200425 (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
426 e(environment.block_start_string),
427 e(environment.block_start_string),
428 e(environment.block_end_string),
429 e(environment.block_end_string),
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200430 block_suffix_re
Armin Ronacherb3b58022009-02-04 19:33:58 +0100431 )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
Armin Ronachera6c3ac52007-03-27 22:51:51 +0200432 (c('(.)'), (Failure('Missing end of raw directive'),), None)
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200433 ],
434 # line statements
Armin Ronacherb3b58022009-02-04 19:33:58 +0100435 TOKEN_LINESTATEMENT_BEGIN: [
436 (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200437 ] + tag_rules,
438 # line comments
439 TOKEN_LINECOMMENT_BEGIN: [
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200440 (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
441 TOKEN_LINECOMMENT_END), '#pop')
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200442 ]
Armin Ronacher2e9396b2008-04-16 14:21:57 +0200443 }
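
    # Shape of `self.rules` (illustrative summary, not part of the original
    # module): a mapping from lexer state to a list of
    #
    #     (compiled regex, token type(s), new state)
    #
    # entries.  The token part is either a single token type, or a tuple
    # with one entry per regex group, where an entry may also be a
    # `Failure` instance (raise an error) or '#bygroup' (take the name of
    # the named group that matched).  The new state is a state name, None
    # (stay), '#pop' (leave the current state) or '#bygroup'.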

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize the newlines
        to the configured sequence."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls :meth:`tokeniter` and wraps the resulting generator in
        a :class:`TokenStream`.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
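
    # Example (illustrative, not part of the original module): `tokenize`
    # is the high-level entry point; it yields parser-ready Token objects
    # with whitespace and comments already filtered out and values
    # converted (integers to int, operators to their token types, ...):
    #
    #     >>> from jinja2 import Environment
    #     >>> lexer = get_lexer(Environment())
    #     >>> [(t.type, t.value) for t in lexer.tokenize(u'{{ 1 + 2 }}')]
    #     [('variable_begin', u'{{'), ('integer', 1), ('add', u'+'),
    #      ('integer', 2), ('variable_end', u'}}')]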

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape the string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as a bytestring (ascii only)
                # we do that for support of semi-broken APIs such as
                # datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as tokens are yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match; either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
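
    # Example (illustrative, not part of the original module): `tokeniter`
    # yields raw (lineno, token, value) triples, including the whitespace
    # tokens that `wrap` later filters out:
    #
    #     >>> from jinja2 import Environment
    #     >>> lexer = get_lexer(Environment())
    #     >>> list(lexer.tokeniter(u'{{ foo }}', 'demo'))
    #     [(1, 'variable_begin', u'{{'), (1, 'whitespace', u' '),
    #      (1, 'name', u'foo'), (1, 'whitespace', u' '),
    #      (1, 'variable_end', u'}}')]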