blob: 19509ac217607ff04f02ea7f3357694a5fdc4a00 [file] [log] [blame]
# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2009 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
17import re
Armin Ronacher4325e372008-05-01 22:59:47 +020018from operator import itemgetter
19from collections import deque
Armin Ronacher82b3f3d2008-03-31 20:01:08 +020020from jinja2.exceptions import TemplateSyntaxError
Armin Ronacherb5124e62008-04-25 00:36:14 +020021from jinja2.utils import LRUCache
Armin Ronacher92f572f2007-02-26 22:17:32 +010022
23
# cache for the lexers.  Exists in order to be able to have multiple
# environments with the same lexer.  Keyed by the environment's
# lexer-relevant settings (see `get_lexer` below); bounded so unused
# syntax combinations eventually expire.
_lexer_cache = LRUCache(50)
Armin Ronacher21580912007-04-17 17:13:10 +020027
# static regular expressions shared by every `Lexer` instance; compiled
# once at import time.
whitespace_re = re.compile(r'\s+', re.U)

# a single- or double-quoted string literal, allowing escaped quotes
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)

# numeric literals; the lookbehind refuses a float that directly
# follows a dot so attribute-like sequences are not misread
integer_re = re.compile(r'\d+')
float_re = re.compile(r'(?<!\.)\d+\.\d+')

# python-style identifiers
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')

# any newline flavour, longest alternative first
newline_re = re.compile(r'(\r\n|\r|\n)')
# map every operator to the token type the parser expects
operators = {
    '+': 'add',
    '-': 'sub',
    '/': 'div',
    '//': 'floordiv',
    '*': 'mul',
    '%': 'mod',
    '**': 'pow',
    '~': 'tilde',
    '[': 'lbracket',
    ']': 'rbracket',
    '(': 'lparen',
    ')': 'rparen',
    '{': 'lbrace',
    '}': 'rbrace',
    '==': 'eq',
    '!=': 'ne',
    '>': 'gt',
    '>=': 'gteq',
    '<': 'lt',
    '<=': 'lteq',
    '=': 'assign',
    '.': 'dot',
    ':': 'colon',
    '|': 'pipe',
    ',': 'comma',
    ';': 'semicolon'
}

# inverse mapping (token type -> operator) used to render tokens back
# into source snippets, e.g. for error messages
reverse_operators = dict((v, k) for k, v in operators.items())
assert len(operators) == len(reverse_operators), 'operators dropped'

# one regex that matches any operator.  Longer operators come first so
# that e.g. '**' wins over '*' (sorted() is stable, so ties keep their
# original order just as with a negated-length key).
operator_re = re.compile('(%s)' % '|'.join(
    re.escape(op) for op in sorted(operators, key=len, reverse=True)))
Armin Ronacher1cc232c2007-09-07 17:52:41 +020071
Armin Ronacher92f572f2007-02-26 22:17:32 +010072
def count_newlines(value):
    """Count the number of newline characters in the string.  This is
    useful for extensions that filter a stream and need to keep line
    numbers accurate.
    """
    return sum(1 for _ in newline_re.finditer(value))
78
79
class Failure(object):
    """Callable that raises a `TemplateSyntaxError` when invoked.  The
    `Lexer` plants instances of this class in its rule tables to turn
    known bad input into a proper error instead of a token.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +010091
92
class Token(tuple):
    """The tokens produced by the lexer: an immutable ``(lineno, type,
    value)`` triple with named read-only access to the three fields.
    """
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        # ``str()`` normalizes the type to a plain string.  The former
        # ``intern()`` call was dropped: the builtin does not exist on
        # Python 3 and ``str()`` of a str returns the identical (already
        # interned, if literal) object, so identity-based fast paths keep
        # working.
        return tuple.__new__(cls, (lineno, str(type), value))

    def __str__(self):
        """Render the token back into a source-like string."""
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            # BUGFIX: was ``self.type is 'name'`` -- identity comparison
            # of strings only works by interning accident; use equality.
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions; `True` if any matches."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
134
135
class TokenStreamIterator(object):
    """The iterator for token streams.  Iterates over the stream until
    the ``eof`` token is reached and closes the stream afterwards.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        """Return the current token and advance the underlying stream."""
        token = self.stream.current
        if token.type == 'eof':
            self.stream.close()
            raise StopIteration()
        self.stream.next()
        return token

    # BUGFIX: expose the Python 3 iterator protocol as well; without this
    # alias the class is only iterable on Python 2.
    __next__ = next
154
155
class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        # store the iterator itself rather than its bound ``.next``
        # method (which is Python-2-only); advanced via the ``next()``
        # builtin below so the code works on Python 2.6+ and 3.
        self._iter = iter(generator)
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, 'initial', '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """Are we at the end of the stream?"""
        return bool(self._pushed) or self.current.type != 'eof'

    # Python 3 spelling of the truth-value protocol
    __bool__ = __nonzero__

    eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token without consuming it."""
        old_token = self.next()
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in range(n):
            self.next()

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def next(self):
        """Go one token ahead and return the old one."""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        # BUGFIX: was ``is not 'eof'`` -- string identity comparison that
        # only worked because token types happened to be interned.
        elif self.current.type != 'eof':
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()
        return rv

    def close(self):
        """Close the stream and drop the underlying iterator."""
        self.current = Token(self.current.lineno, 'eof', '')
        self._iter = None
        self.closed = True

    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            # BUGFIX: was ``is 'eof'`` -- use equality, not identity.
            if self.current.type == 'eof':
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            self.next()
246
247
def get_lexer(environment):
    """Return a lexer for *environment*, reusing a cached instance when
    one with identical lexer-relevant settings already exists.
    """
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        _lexer_cache[key] = lexer = Lexer(environment)
    return lexer
Armin Ronacher21580912007-04-17 17:13:10 +0200264
265
class Lexer(object):
    """Class that implements a lexer for a given environment.  Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags.  ``whitespace`` tokens are produced here
        # and filtered out again in `wrap`.
        tag_rules = [
            (whitespace_re, 'whitespace', None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]

        # assemble the root lexing rule.  because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules.  This is done here so that the escape
        # signs don't count for the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that.  We add this rule *after* all the others.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            # BUGFIX: raw string -- '^\s*' in a normal string literal is an
            # invalid escape sequence (a SyntaxError on modern Python).
            root_tag_rules.insert(0, ('linestatement', r'^\s*' + prefix))

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules.  All pattern fragments below are raw
        # strings so regex escapes (\s, \-, ...) pass through verbatim.
        self.rules = {
            'root': [
                # directives
                (c(r'(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), ('data', '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c(r'(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # variables
            'variable_begin': [
                (c(r'\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules,
            # raw block
            'raw_begin': [
                (c(r'(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            'linestatement_begin': [
                (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
            ] + tag_rules
        }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize the newline
        flavours to the configured sequence.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(self, source, name=None, filename=None, state=None):
        """Tokenize the source with `tokeniter`, post-process the raw
        tuples with `wrap` and return the result as a `TokenStream`.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and
        wraps every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            # tokens the parser never wants to see
            if token in ('comment_begin', 'comment', 'comment_end',
                         'whitespace'):
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                # BUGFIX: modern syntax (was ``except Exception, e``)
                except Exception as e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        """
        # NOTE(review): ``unicode`` ties this module to Python 2; kept here
        # to preserve the py2 text-coercion semantics -- revisit on a port.
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().items():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token just are yielded as it.
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().items():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single
            # match -- either we are at the end of the file or we have a
            # problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)