blob: 0597b7a80b46f63c4067f6190c27fd1b3367e2ed [file] [log] [blame]
# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer.  The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates.  On the other hand it separates
    template code and python code in expressions.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
"""
import re
import unicodedata
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache
Armin Ronacher92f572f2007-02-26 22:17:32 +010023
24
# cache for the lexers.  Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
# NOTE: the inline flags at the end apply to the whole pattern
# ((?um) -> unicode + multiline, (?ms) -> dotall + multiline).
whitespace_re = re.compile(r'\s+(?um)')
# a single or double quoted string literal including escaped quotes
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
# float must be tried before integer by the lexer rules
float_re = re.compile(r'\d+\.\d+')
# all newline variants a template may contain
newline_re = re.compile(r'(\r\n|\r|\n)')

# bind operators to token types
operators = {
    '+': 'add',
    '-': 'sub',
    '/': 'div',
    '//': 'floordiv',
    '*': 'mul',
    '%': 'mod',
    '**': 'pow',
    '~': 'tilde',
    '[': 'lbracket',
    ']': 'rbracket',
    '(': 'lparen',
    ')': 'rparen',
    '{': 'lbrace',
    '}': 'rbrace',
    '==': 'eq',
    '!=': 'ne',
    '>': 'gt',
    '>=': 'gteq',
    '<': 'lt',
    '<=': 'lteq',
    '=': 'assign',
    '.': 'dot',
    ':': 'colon',
    '|': 'pipe',
    ',': 'comma',
    ';': 'semicolon'
}

# reverse mapping (token type -> operator source text); used by Token.__str__
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
# sorted by decreasing length so that e.g. '**' is matched before '*'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
Armin Ronacher1cc232c2007-09-07 17:52:41 +020072
Armin Ronacher92f572f2007-02-26 22:17:32 +010073
def count_newlines(value):
    """Return the number of newline sequences (``\\r\\n``, ``\\r`` or
    ``\\n``) found in the string.  This is useful for extensions that
    filter a stream.
    """
    return len(re.findall(r'(\r\n|\r|\n)', value))
79
80
class Failure(object):
    """Callable placeholder used in the lexer rule tables.  Invoking an
    instance raises the stored error class (a `TemplateSyntaxError` by
    default) with the configured message.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.error_class = cls
        self.message = message

    def __call__(self, lineno, filename):
        # called by Lexer.tokeniter when a failure rule matches
        raise self.error_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +010092
93
class Token(tuple):
    """Token class — an immutable ``(lineno, type, value)`` triple.

    The type string is interned in :meth:`__new__` so many tokens can be
    stored cheaply.  Comparisons against type literals are nevertheless
    done with ``==``: relying on ``is`` with string literals only works
    by accident of CPython's interning and is fragile.
    """
    __slots__ = ()
    # read-only accessors into the underlying tuple
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        # operators stringify to their source text, names to their value,
        # everything else to the bare token type
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
135
136
class TokenStreamIterator(object):
    """Iterator returned by :meth:`TokenStream.__iter__`.  Walks the
    underlying stream token by token and stops (closing the stream) as
    soon as the ``eof`` token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        current = self.stream.current
        if current.type != 'eof':
            # advance the stream, hand out the token we were standing on
            self.stream.next()
            return current
        self.stream.close()
        raise StopIteration()
155
156
class TokenStream(object):
    r"""A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.

    :param generator: an iterable of tokens (as produced by `Lexer.wrap`).
    :param name: template name, used in error messages.
    :param filename: template filename, used in error messages.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        # dummy start token; the next() call below immediately replaces
        # it with the first real token of the stream
        self.current = Token(1, 'initial', '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """Are we at the end of the stream?"""
        return bool(self._pushed) or self.current.type != 'eof'

    eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token without consuming it."""
        old_token = self.next()
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in xrange(n):
            self.next()

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one."""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        # equality, not identity: token types are interned but `is` with a
        # string literal is an implementation-detail comparison
        elif self.current.type != 'eof':
            try:
                self.current = self._next()
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream and drop the reference to the generator."""
        self.current = Token(self.current.lineno, 'eof', '')
        self._next = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.

        :raise TemplateSyntaxError: if the current token does not match.
        """
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type == 'eof':
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            self.next()
247
248
class LexerMeta(type):
    """Metaclass for :class:`Lexer` that reuses an already constructed
    lexer when another environment asks for an identical configuration.
    Instances are cached in the module level :data:`_lexer_cache`.
    """

    def __call__(cls, environment):
        # every environment setting that influences lexing is part of
        # the cache key
        key = tuple(getattr(environment, attr) for attr in (
            'block_start_string',
            'block_end_string',
            'variable_start_string',
            'variable_end_string',
            'comment_start_string',
            'comment_end_string',
            'line_statement_prefix',
            'trim_blocks',
            'newline_sequence'
        ))
        lexer = _lexer_cache.get(key)
        if lexer is None:
            lexer = type.__call__(cls, environment)
            _lexer_cache[key] = lexer
        return lexer
269
270
class Lexer(object):
    """Class that implements a lexer for a given environment.  Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    __metaclass__ = LexerMeta

    def __init__(self, environment):
        # shortcuts: every rule regex is compiled multiline + dotall
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for the interior of tags.  order matters:
        # float must be tried before integer
        tag_rules = [
            (whitespace_re, 'whitespace', None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]

        # assemble the root lexing rule.  because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables.  (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules.  This is done here so that the escape
        # signs don't count for the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that.  It is inserted at the *front* of the rule list so it
        # takes precedence over the other root tags.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))

        # block suffix if trimming is enabled: optionally eat the
        # newline following a block tag
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules.  maps a state name to a list of
        # (compiled regex, token type(s), new state) triples.  '#pop'
        # and '#bygroup' are magic values interpreted by tokeniter().
        self.rules = {
            'root': [
                # directives: everything up to the next tag opening.
                # a raw block opener gets its own named group so that
                # '#bygroup' can route into the 'raw_begin' state.
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), ('data', '#bygroup'), '#bygroup'),
                # data: plain template text with no tag in sight
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # variables
            'variable_begin': [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules,
            # raw block: swallow everything up to the matching endraw tag
            'raw_begin': [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements: terminated by the end of the line
            'linestatement_begin': [
                (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
            ] + tag_rules
        }

    def _normalize_newlines(self, value):
        """Replace every newline variant in `value` with the configured
        newline sequence of the environment.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None):
        """Tokenize `source` (via :meth:`tokeniter` + :meth:`wrap`) and
        return the result as a :class:`TokenStream`.
        """
        stream = self.tokeniter(source, name, filename)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            # drop tokens the parser never wants to see
            if token in ('comment_begin', 'comment', 'comment_end',
                         'whitespace'):
                continue
            # line statements are presented to the parser as blocks
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string (strip the quotes first)
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        Yields plain ``(lineno, token_type, value)`` triples; see
        :meth:`wrap` for the conversion into :class:`Token` objects.
        """
        # normalize all newline flavours to '\n' for lexing
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        # stack of expected closing brackets; while non-empty, end-tag
        # rules are suppressed (see below)
        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced.  continue parsing with the lower rule which
                # is the operator rule.  do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group: raise the configured error
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group: positional group idx+1 of the match
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token just are yielded as it.
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new function and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single
            # match; either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)