# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2009 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')
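
# Note (illustrative example): the lookbehind in float_re prevents '42.5'
# in u'foo.42.5' from being read as a float literal after the attribute
# dot; that expression lexes as name/dot/integer/dot/integer instead.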

# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+': TOKEN_ADD,
    '-': TOKEN_SUB,
    '/': TOKEN_DIV,
    '//': TOKEN_FLOORDIV,
    '*': TOKEN_MUL,
    '%': TOKEN_MOD,
    '**': TOKEN_POW,
    '~': TOKEN_TILDE,
    '[': TOKEN_LBRACKET,
    ']': TOKEN_RBRACKET,
    '(': TOKEN_LPAREN,
    ')': TOKEN_RPAREN,
    '{': TOKEN_LBRACE,
    '}': TOKEN_RBRACE,
    '==': TOKEN_EQ,
    '!=': TOKEN_NE,
    '>': TOKEN_GT,
    '>=': TOKEN_GTEQ,
    '<': TOKEN_LT,
    '<=': TOKEN_LTEQ,
    '=': TOKEN_ASSIGN,
    '.': TOKEN_DOT,
    ':': TOKEN_COLON,
    '|': TOKEN_PIPE,
    ',': TOKEN_COMMA,
    ';': TOKEN_SEMICOLON
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
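
# The descending-length sort above matters because "|" alternation in a
# regex is first-match-wins: '**' must be tried before '*', '<=' before
# '<', and so on. A doctest-style illustration:
#
#   >>> operator_re.match('**').group()
#   '**'
#   >>> operator_re.match('<=').group()
#   '<='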
Armin Ronacher1cc232c2007-09-07 17:52:41 +0200122
Armin Ronacher59b6bd52009-03-30 21:00:16 +0200123ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
124 TOKEN_COMMENT_END, TOKEN_WHITESPACE,
125 TOKEN_WHITESPACE, TOKEN_LINECOMMENT_BEGIN,
126 TOKEN_LINECOMMENT_END, TOKEN_LINECOMMENT])
Armin Ronacherdb7985d2009-03-31 23:51:56 +0200127ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
128 TOKEN_COMMENT, TOKEN_LINECOMMENT])


def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))


def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^\s*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]
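
# For illustration: with the default delimiters ('{#', '{%', '{{') and no
# line statement or line comment prefix configured, this returns roughly
#
#   [('variable', '\{\{'), ('comment', '\{\#'), ('block', '\{\%')]
#
# i.e. (name, escaped pattern) pairs with the longest start strings first
# (ties broken by rule name), so more specific delimiters win.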


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of strings that are not interned.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
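
# A doctest-style sketch of token expressions (the values are made up
# for illustration):
#
#   >>> tok = Token(1, 'name', 'endfor')
#   >>> tok.test('name'), tok.test('name:endfor')
#   (True, True)
#   >>> tok.test_any('integer', 'name:endfor')
#   True
#   >>> str(tok)
#   'endfor'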


class TokenStreamIterator(object):
    """The iterator for token streams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        self.stream.next()
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """Are we at the end of the stream?"""
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token."""
        old_token = self.next()
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            self.next()

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one"""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = self._next()
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            self.next()
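
# Typical parser-side navigation of a stream (hypothetical tokens, for
# illustration only):
#
#   stream.expect('block_begin')       # raises TemplateSyntaxError on mismatch
#   if stream.skip_if('name:endfor'):  # consume the token only if it matches
#       ...
#   peeked = stream.look()             # inspect the next token, don't consume
#   stream.skip(2)                     # advance two tokens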


def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
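
# Minimal usage sketch (assuming a default-configured Environment, which
# exposes this cached lexer as its `lexer` attribute):
#
#   >>> from jinja2 import Environment
#   >>> lexer = get_lexer(Environment())
#   >>> [t.type for t in lexer.tokenize(u'Hello {{ name }}!')]
#   ['data', 'variable_begin', 'name', 'variable_end', 'data']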


class Lexer(object):
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), TOKEN_DATA, None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }
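
        # Rough state flow for u'{{ name }}' (illustrative): the 'root'
        # rule yields the leading data plus 'variable_begin' and pushes
        # the variable_begin state; its rules then yield expression
        # tokens until the end string matches, which pops back to 'root'.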

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize the newlines."""
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and wraps the result in a token stream.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
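                # e.g. in '{{ {1: 2}}}' the '}}' right after the dict
                # literal must not terminate the variable tag while the
                # stack still expects a '}'; the operator rule below eats
                # the single '}' first, after which the tag may end.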
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token are yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match; either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)