# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
"""
import re
import unicodedata
from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+(?um)')
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'\d+\.\d+')

# bind operators to token types
operators = {
    '+': 'add',
    '-': 'sub',
    '/': 'div',
    '//': 'floordiv',
    '*': 'mul',
    '%': 'mod',
    '**': 'pow',
    '~': 'tilde',
    '[': 'lbracket',
    ']': 'rbracket',
    '(': 'lparen',
    ')': 'rparen',
    '{': 'lbrace',
    '}': 'rbrace',
    '==': 'eq',
    '!=': 'ne',
    '>': 'gt',
    '>=': 'gteq',
    '<': 'lt',
    '<=': 'lteq',
    '=': 'assign',
    '.': 'dot',
    ':': 'colon',
    '|': 'pipe',
    ',': 'comma',
    ';': 'semicolon'
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
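# Illustrative note (not in the original source): because the
# alternatives above are sorted longest-first, multi-character
# operators win over their single-character prefixes:
#
#     >>> operator_re.match('**').group()
#     '**'
#
# Python's `|` alternation takes the first branch that matches, so
# without the sort '*' would shadow '**'.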

simple_escapes = {
    'a': '\a',
    'n': '\n',
    'r': '\r',
    'f': '\f',
    't': '\t',
    'v': '\v',
    '\\': '\\',
    '"': '"',
    "'": "'",
    '0': '\x00'
}
unicode_escapes = {
    'x': 2,
    'u': 4,
    'U': 8
}

def unescape_string(lineno, filename, s):
    r"""Unescape a string. Supported escapes:
        \a, \n, \r, \t, \f, \v, \\, \", \', \0

        \x00, \u0000, \U00000000, \N{...}
    """
    try:
        return s.encode('ascii', 'backslashreplace').decode('unicode-escape')
    except UnicodeError, e:
        msg = str(e).split(':')[-1].strip()
        raise TemplateSyntaxError(msg, lineno, filename)
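# A minimal usage sketch (illustrative, not part of the module): the
# lexer passes in the string contents without the surrounding quotes,
# i.e. it calls unescape_string(lineno, filename, value[1:-1]).
#
#     >>> unescape_string(1, '<template>', 'foo\\nbar')
#     u'foo\nbar'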


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type is 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or 'token_type:token_value'.  This can only test
        against string values!
        """
        # here we do a regular string equality check as test_many is
        # usually passed an iterable of strings that are not interned.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False
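    # Illustrative example (not in the original source): a token can be
    # tested by type alone or by 'type:value':
    #
    #     >>> tok = Token(1, 'name', 'foo')
    #     >>> tok.test('name'), tok.test('name:foo'), tok.test('name:bar')
    #     (True, True, False)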

    def test_many(self, iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )


class TokenStreamIterator(object):
    """The iterator for tokenstreams.  Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self._stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self._stream.current
        if token.type == 'eof':
            self._stream.close()
            raise StopIteration()
        self._stream.next(False)
        return token


class TokenStream(object):
    """A token stream wraps a generator and supports pushing tokens back.
    It also provides some functions to expect tokens and similar stuff.

    Important note: Never push more than one token back to the
                    stream.  Although the stream object won't stop you
                    from doing so, the behavior is undefined.  Multiple
                    pushed tokens are only used internally!
    """

    def __init__(self, generator, filename):
        self._next = generator.next
        self._pushed = deque()
        self.current = Token(1, 'initial', '')
        self.filename = filename
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """True as long as the stream is not exhausted."""
        return bool(self._pushed) or self.current.type != 'eof'

    eos = property(lambda x: not x.__nonzero__(),
                   doc='True if the stream reached the end of file.')

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self):
        """Look at the next token without consuming it."""
        old_token = self.next()
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n):
        """Go n tokens ahead."""
        for x in xrange(n):
            self.next()

    def next(self, skip_eol=True):
        """Go one token ahead and return the old one"""
        rv = self.current
        while 1:
            if self._pushed:
                self.current = self._pushed.popleft()
            elif self.current.type is not 'eof':
                try:
                    self.current = self._next()
                except StopIteration:
                    self.close()
            if not skip_eol or self.current.type is not 'eol':
                break
        return rv

    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, 'eof', '')
        self._next = None

    def expect(self, expr):
        """Expect a given token type and return it"""
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type is 'eof':
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.filename)
        try:
            return self.current
        finally:
            self.next()

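    # Illustrative usage sketch (not in the original source), assuming
    # `env` is a configured jinja2 Environment:
    #
    #     >>> stream = env.lexer.tokenize(u'{{ foo }}')
    #     >>> stream.expect('variable_begin').type
    #     'variable_begin'
    #     >>> stream.current.value            # the name token
    #     'foo'
    #     >>> stream.look().type              # peek without consuming
    #     'variable_end'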

class LexerMeta(type):
    """Metaclass for the lexer that caches instances for the same
    configuration in the module-level LRU cache.
    """

    def __call__(cls, environment):
        key = (environment.block_start_string,
               environment.block_end_string,
               environment.variable_start_string,
               environment.variable_end_string,
               environment.comment_start_string,
               environment.comment_end_string,
               environment.line_statement_prefix,
               environment.trim_blocks)
        lexer = _lexer_cache.get(key)
        if lexer is None:
            lexer = type.__call__(cls, environment)
            _lexer_cache[key] = lexer
        return lexer

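# Illustrative example (not in the original source): environments with
# identical lexer-relevant settings end up sharing one `Lexer` instance,
# assuming `Environment` wires up `self.lexer = Lexer(self)`:
#
#     >>> from jinja2 import Environment
#     >>> Environment().lexer is Environment().lexer
#     True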

class Lexer(object):
    """Class that implements a lexer for a given environment.  Automatically
    created by the environment class; usually you don't have to create one
    yourself.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    __metaclass__ = LexerMeta

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, None, None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]
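        # Illustrative note (not in the original source): rule order
        # matters here.  `float_re` must come before `integer_re`, or a
        # float literal would be split into integer / dot / integer:
        #
        #     >>> float_re.match('42.23').group()
        #     '42.23'
        #     >>> integer_re.match('42.23').group()
        #     '42'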

        # assemble the root lexing rule.  because "|" is ungreedy (the
        # regex engine takes the first alternative that matches) we have
        # to sort by length so that the lexer keeps working as expected
        # when we have parsing rules like <% for blocks and <%= for
        # variables. (if someone wants asp-like syntax)
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules.  This is done here so that the escape
        # characters don't count towards the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that.  We add this rule *in front of* all the others so it is
        # tried first.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), ('data', '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # variables
            'variable_begin': [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules,
            # raw block
            'raw_begin': [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            'linestatement_begin': [
                (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
            ] + tag_rules
        }
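        # Illustrative note (not in the original source): each rule above
        # is a (compiled regex, token type(s), new state) triple.  E.g. in
        # 'variable_begin' the single rule yields a 'variable_end' token
        # and '#pop's back to the enclosing state; '#bygroup' resolves the
        # token or state from the first named group that matched.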

    def tokenize(self, source, filename=None):
        """Works like `tokeniter` but returns a `TokenStream` of tokens
        and not a generator of token tuples.  Additionally all token
        values are already converted into the proper types and
        postprocessed: comments are removed, integers and floats
        converted, strings unescaped etc.
        """
        source = unicode(source)
        def generate():
            for lineno, token, value in self.tokeniter(source, filename):
                if token in ('comment_begin', 'comment', 'comment_end'):
                    continue
                elif token == 'linestatement_begin':
                    token = 'block_begin'
                elif token == 'linestatement_end':
                    token = 'block_end'
                # we are not interested in those tokens in the parser
                elif token in ('raw_begin', 'raw_end'):
                    continue
                elif token == 'data':
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'keyword':
                    token = value
                elif token == 'name':
                    value = str(value)
                elif token == 'string':
                    value = unescape_string(lineno, filename, value[1:-1])
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'integer':
                    value = int(value)
                elif token == 'float':
                    value = float(value)
                elif token == 'operator':
                    token = operators[value]
                yield Token(lineno, token, value)
        return TokenStream(generate(), filename)

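    # Illustrative example (not in the original source): for a default
    # environment the postprocessed stream for u'{{ 1 + 2 }}' contains
    # roughly these tokens (whitespace is dropped, '+' is mapped to the
    # 'add' operator token, integers are converted):
    #
    #     Token(1, 'variable_begin', u'{{')
    #     Token(1, 'integer', 1)
    #     Token(1, 'add', u'+')
    #     Token(1, 'integer', 2)
    #     Token(1, 'variable_end', u'}}')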
    def tokeniter(self, source, filename=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a
        template.  The output is not compatible with the input the
        jinja parser wants; the parser uses the `tokenize` method
        which returns a `TokenStream` of postprocessed tokens.
        """
        source = '\n'.join(source.splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        balancing_stack = []

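        # Illustrative note (not in the original source): `stack` drives a
        # small pushdown automaton.  Matching '{{' in 'root' pushes the
        # 'variable_begin' state; its end rule '#pop's back to 'root'.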
        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced.  continue parsing with the lower rule which
                # is the operator rule.  do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue
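                # Illustrative note (not in the original source): this is
                # what lets e.g. {{ {'a': 1}}} lex correctly: while the
                # dict's '}' is still open on `balancing_stack` the
                # variable_end rule is skipped, so the operator rule
                # consumes the inner '}' first.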

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # hidden group
                        if token is None:
                            g = m.group(idx + 1)
                            if g:
                                lineno += g.count('\n')
                            continue
                        # failure group
                        elif token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token types are just yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, filename)
                    # yield items
                    if tokens is not None:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start over with the new rules
                pos = pos2
                break
            # if the for loop terminated without break we haven't found a
            # single match; either we are at the end of the file or we
            # have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          filename)
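
# Illustrative example (not in the original source): the raw tuple
# stream from `tokeniter` for u'Hello {{ name }}!' looks roughly like:
#
#     (1, 'data', u'Hello ')
#     (1, 'variable_begin', u'{{')
#     (1, 'name', u'name')
#     (1, 'variable_end', u'}}')
#     (1, 'data', u'!')
#
# The whitespace inside the tag is consumed by `whitespace_re`, whose
# rule carries no token type and therefore yields nothing.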