# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.
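
    An illustrative sketch (assuming the default delimiters and the
    `Environment.lexer` property)::

        >>> from jinja2 import Environment
        >>> stream = Environment().lexer.tokenize(u'Hello {{ name }}!')
        >>> [token.type for token in stream]
        ['data', 'variable_begin', 'name', 'variable_end', 'data']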

    :copyright: (c) 2010 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache, next


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')

# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))

float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')

# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))

ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END,
                            TOKEN_LINECOMMENT])
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])


def _describe_token_type(token_type):
    if token_type in reverse_operators:
        return reverse_operators[token_type]
    return {
        TOKEN_COMMENT_BEGIN: 'begin of comment',
        TOKEN_COMMENT_END: 'end of comment',
        TOKEN_COMMENT: 'comment',
        TOKEN_LINECOMMENT: 'comment',
        TOKEN_BLOCK_BEGIN: 'begin of statement block',
        TOKEN_BLOCK_END: 'end of statement block',
        TOKEN_VARIABLE_BEGIN: 'begin of print statement',
        TOKEN_VARIABLE_END: 'end of print statement',
        TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement',
        TOKEN_LINESTATEMENT_END: 'end of line statement',
        TOKEN_DATA: 'template data / text',
        TOKEN_EOF: 'end of template'
    }.get(token_type, token_type)


def describe_token(token):
    """Returns a description of the token."""
    if token.type == 'name':
        return token.value
    return _describe_token_type(token.type)


def describe_token_expr(expr):
    """Like `describe_token` but for token expressions."""
    if ':' in expr:
        type, value = expr.split(':', 1)
        if type == 'name':
            return value
    else:
        type = expr
    return _describe_token_type(type)


def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
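
    An illustrative example::

        >>> count_newlines(u'foo\\nbar\\r\\nbaz')
        2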
182 """
183 return len(newline_re.findall(value))
184
185
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200186def compile_rules(environment):
187 """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^\s*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class Token(tuple):
223 """Token class."""
    __slots__ = ()
    # expose the tuple fields as named read-only properties
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
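
        An illustrative example::

            >>> Token(1, 'name', 'foo').test('name')
            True
            >>> Token(1, 'name', 'foo').test('name:bar')
            False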
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of strings that are not interned.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )


class TokenStreamIterator(object):
266 """The iterator for tokenstreams. Iterate over the stream
267 until the eof token is reached.
268 """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        next(self.stream)
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s. The
    parser, however, does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
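
    A small sketch of the protocol::

        >>> stream = TokenStream(iter([Token(1, 'name', 'foo')]), None, None)
        >>> stream.current
        Token(1, 'name', 'foo')
        >>> next(stream)
        Token(1, 'name', 'foo')
        >>> stream.current
        Token(1, 'eof', '')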
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            next(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one."""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = self._next()
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, describe_token(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            next(self)


def get_lexer(environment):
376 """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer


class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class; usually you don't have to create one
    yourself.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
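
    An illustrative sketch of the raw token tuples produced by
    :meth:`tokeniter` (default delimiters; whitespace is not yet
    filtered out at this stage)::

        >>> from jinja2 import Environment
        >>> lexer = Lexer(Environment())
        >>> [t for _, t, _ in lexer.tokeniter(u'{{ foo }}', 'demo')]
        ['variable_begin', 'whitespace', 'name', 'whitespace', 'variable_end']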
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for blocks and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), TOKEN_DATA, None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }

    def _normalize_newlines(self, value):
493 """Called for strings and template data to normlize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls :meth:`tokeniter` and wraps the resulting generator in a
        :class:`TokenStream`.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
503 """This is called with the stream as returned by `tokenize` and wraps
504 every token in a :class:`Token` and converts the value.
505 """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as a bytestring (ascii only)
                # we do that to support semi-broken APIs such as
                # datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as tokens are just yielded as-is.
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected \'%s\'' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected \'%s\', '
                                                          'expected \'%s\'' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match; either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)