blob: 6e9fc8943398065348a7d23cd4aed93f3f6b481b [file] [log] [blame]
Armin Ronacher92f572f2007-02-26 22:17:32 +01001# -*- coding: utf-8 -*-
2"""
Armin Ronacher07bc6842008-03-31 14:18:49 +02003 jinja2.lexer
4 ~~~~~~~~~~~~
Armin Ronacher3b65b8a2007-02-27 20:21:45 +01005
Armin Ronacher5a8e4972007-04-05 11:21:38 +02006 This module implements a Jinja / Python combination lexer. The
7 `Lexer` class provided by this module is used to do some preprocessing
8 for Jinja.
9
10 On the one hand it filters out invalid operators like the bitshift
11 operators we don't allow in templates. On the other hand it separates
12 template code and python code in expressions.
13
Armin Ronacher1d51f632008-03-25 14:34:45 +010014 :copyright: 2007-2008 by Armin Ronacher.
Armin Ronacher3b65b8a2007-02-27 20:21:45 +010015 :license: BSD, see LICENSE for more details.
Armin Ronacher92f572f2007-02-26 22:17:32 +010016"""
17import re
Armin Ronacher1cc232c2007-09-07 17:52:41 +020018import unicodedata
Armin Ronacher82b3f3d2008-03-31 20:01:08 +020019from jinja2.datastructure import TokenStream, Token
20from jinja2.exceptions import TemplateSyntaxError
Armin Ronacher21580912007-04-17 17:13:10 +020021from weakref import WeakValueDictionary
Armin Ronacher92f572f2007-02-26 22:17:32 +010022
23
# public API of this module
__all__ = ['Lexer', 'Failure', 'keywords']


# cache for the lexers.  Exists in order to be able to have multiple
# environments with the same lexer.  Weak values: an entry disappears
# automatically once no environment references that lexer any more.
_lexer_cache = WeakValueDictionary()
30
31
# static regular expressions
whitespace_re = re.compile(r'\s+(?um)')  # runs of any whitespace
# single- or double-quoted string literal with backslash escapes
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
integer_re = re.compile(r'\d+')
# identifier; \b anchors keep it from matching inside longer tokens
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'\d+\.\d+')
# one or more line ends including surrounding whitespace
eol_re = re.compile(r'(\s*$\s*)+(?m)')
Armin Ronacher92f572f2007-02-26 22:17:32 +010041
# set of used keywords.  `Lexer.tokenize` turns name tokens matching one
# of these into keyword tokens.
keywords = set(['and', 'block', 'elif', 'else', 'endblock', 'print',
                'endfilter', 'endfor', 'endif', 'endmacro', 'endraw',
                'endtrans', 'extends', 'filter', 'for', 'if', 'in',
                'include', 'is', 'macro', 'not', 'or', 'pluralize', 'raw',
                'recursive', 'set', 'trans', 'call', 'endcall'])
Armin Ronacher1cc232c2007-09-07 17:52:41 +020048
# bind operators to token types
operators = {
    '+': 'add',
    '-': 'sub',
    '/': 'div',
    '//': 'floordiv',
    '*': 'mul',
    '%': 'mod',
    '**': 'pow',
    '~': 'tilde',
    '[': 'lbracket',
    ']': 'rbracket',
    '(': 'lparen',
    ')': 'rparen',
    '{': 'lbrace',
    '}': 'rbrace',
    '==': 'eq',
    '!=': 'ne',
    '>': 'gt',
    '>=': 'gteq',
    '<': 'lt',
    '<=': 'lteq',
    '=': 'assign',
    '.': 'dot',
    ':': 'colon',
    '|': 'pipe',
    ',': 'comma',
    ';': 'semicolon'
}

# token type -> operator text; the assert catches two operators
# accidentally mapping to the same token type.
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
# sort longest first so that e.g. '**' wins over '*' in alternation
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
Armin Ronacher1cc232c2007-09-07 17:52:41 +020083
# single-character backslash escapes and their replacement characters
simple_escapes = {
    'a': '\a',
    'n': '\n',
    'r': '\r',
    'f': '\f',
    't': '\t',
    'v': '\v',
    '\\': '\\',
    '"': '"',
    "'": "'",
    '0': '\x00'
}
# numeric unicode escapes: escape letter -> number of hex digits expected
unicode_escapes = {
    'x': 2,
    'u': 4,
    'U': 8
}
101
Armin Ronacher1cc232c2007-09-07 17:52:41 +0200102
def unescape_string(lineno, filename, s):
    r"""
    Resolve backslash escapes in *s* and return the result as unicode.

    Supported escapes:
    \a, \n, \r, \f, \t, \v, \\, \", \', \0

    \x00, \u0000, \U00000000, \N{...}

    Octal escapes (\101) are deliberately not supported because they
    are redundant.

    Raises `TemplateSyntaxError` (reported at *lineno* / *filename*) for
    truncated escapes, invalid codepoints and unknown character names.
    """
    pieces = []
    emit = pieces.append
    chars = iter(s)
    advance = chars.next

    try:
        for current in chars:
            if current != '\\':
                emit(current)
                continue
            # a backslash: dispatch on the character that follows it
            current = advance()
            if current in simple_escapes:
                emit(simple_escapes[current])
            elif current in unicode_escapes:
                # \xHH / \uHHHH / \UHHHHHHHH -- fixed number of hex digits
                digits = [advance() for _ in xrange(unicode_escapes[current])]
                try:
                    emit(unichr(int(''.join(digits), 16)))
                except ValueError:
                    raise TemplateSyntaxError('invalid unicode codepoint',
                                              lineno, filename)
            elif current == 'N':
                # \N{NAME} -- look the codepoint up by its unicode name
                if advance() != '{':
                    raise TemplateSyntaxError('no name for codepoint',
                                              lineno, filename)
                name = []
                while 1:
                    current = advance()
                    if current == '}':
                        break
                    name.append(current)
                try:
                    emit(unicodedata.lookup(u''.join(name)))
                except KeyError:
                    raise TemplateSyntaxError('unknown character name',
                                              lineno, filename)
            else:
                # unknown escape: keep the backslash verbatim
                emit('\\' + current)
    except StopIteration:
        # the string ended in the middle of an escape sequence
        raise TemplateSyntaxError('invalid string escape', lineno, filename)
    return u''.join(pieces)
156
157
def unescape_regex(s):
    """
    Unescape rules for regular expressions.

    NOTE(review): as written this function is an identity transform of
    *s*: ``in_escape`` is initialized to False and never set to True, so
    the escape-handling branch below is unreachable.  It also references
    a global ``safe_chars`` that is not defined anywhere in this module.
    This looks like dead or unfinished code -- confirm intent before
    relying on it.
    """
    buffer = []
    write = buffer.append
    in_escape = False
    for char in s:
        if in_escape:
            # unreachable with the current code (see NOTE in docstring)
            in_escape = False
            if char not in safe_chars:
                write('\\' + char)
                continue
        write(char)
    return u''.join(buffer)
Armin Ronacher2894f222007-03-19 22:39:55 +0100173
Armin Ronacher92f572f2007-02-26 22:17:32 +0100174
class Failure(object):
    """
    Callable error placeholder used in the lexing rules.

    An instance stores an error message and an exception class (by
    default `TemplateSyntaxError`); calling it with a line number and a
    filename raises that exception.  The `Lexer` uses this to attach
    known error conditions to catch-all rules.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        # raise the configured exception at the reported position
        exc_class = self.error_class
        raise exc_class(self.message, lineno, filename)
Armin Ronacher92f572f2007-02-26 22:17:32 +0100187
188
class LexerMeta(type):
    """
    Metaclass for the lexer that caches instances for
    the same configuration in a weak value dictionary.

    Two environments whose delimiter / trim settings are equal share one
    `Lexer` instance.
    """

    def __call__(cls, environment):
        # every setting that influences lexing takes part in the cache key
        key = hash((environment.block_start_string,
                    environment.block_end_string,
                    environment.variable_start_string,
                    environment.variable_end_string,
                    environment.comment_start_string,
                    environment.comment_end_string,
                    environment.trim_blocks))

        # Fetch with a single .get() instead of a containment test followed
        # by a subscript: entries of a WeakValueDictionary may be garbage
        # collected between the two operations, which would turn the
        # subsequent lookup into a spurious KeyError.
        lexer = _lexer_cache.get(key)
        if lexer is None:
            # create a new lexer and cache it
            lexer = type.__call__(cls, environment)
            _lexer_cache[key] = lexer
        return lexer
213
class Lexer(object):
    """
    Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    __metaclass__ = LexerMeta

    def __init__(self, environment):
        # shortcuts: c compiles a pattern in multiline/dotall mode,
        # e escapes literal delimiter text for use inside a pattern
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags (inside {% ... %} / {{ ... }} blocks).
        # Order matters: float before integer, keyword before name.
        tag_rules = [
            (eol_re, 'eol', None),
            (whitespace_re, None, None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            # keywords sorted longest first so alternation prefers the
            # longest match
            (c(r'\b(?:%s)\b' % '|'.join(sorted(keywords, key=lambda x: -len(x)))),
             'keyword', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]

        #: if variables and blocks have the same delimiters we won't
        #: receive any variable blocks in the parser. This variable is `True`
        #: if we need that.
        self.no_variable_block = (
            (environment.variable_start_string is
             environment.variable_end_string is None) or
            (environment.variable_start_string ==
             environment.block_start_string and
             environment.variable_end_string ==
             environment.block_end_string)
        )

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string)
        ]
        if not self.no_variable_block:
            root_tag_rules.append(('variable',
                                   environment.variable_start_string))
        root_tag_rules.sort(lambda a, b: cmp(len(b[1]), len(a[1])))

        # block suffix if trimming is enabled: eat one trailing newline
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        # global lexing rules.  Each state maps to a list of
        # (compiled regex, token(s), new state) tuples; '#pop' pops the
        # state stack, '#bygroup' resolves the state from the first
        # matching named group.
        self.rules = {
            'root': [
                # directives: everything up to the next tag start is data,
                # then switch into the state named by the matching group
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, e(r), e(r))
                        for n, r in root_tag_rules
                    ])), ('data', '#bygroup'), '#bygroup'),
                # data: no more tags in the source, the rest is raw output
                (c('.+'), 'data', None)
            ],
            # comments: everything up to the comment end delimiter
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks: the end delimiter pops the state; everything else is
            # handled by the shared tag rules
            'block_begin': [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # raw block: consume verbatim data up to the endraw directive
            'raw_begin': [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ]
        }

        # only add the variable rules to the list if we process variables
        # the variable_end_string variable could be None and break things.
        if not self.no_variable_block:
            self.rules['variable_begin'] = [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules

    def tokenize(self, source, filename=None):
        """
        Works like `tokeniter` but returns a tokenstream of tokens and not a
        generator or token tuples. Additionally all token values are already
        converted into types and postprocessed. For example keywords are
        already keyword tokens, not named tokens, comments are removed,
        integers and floats converted, strings unescaped etc.
        """
        def generate():
            for lineno, token, value in self.tokeniter(source, filename):
                # comments never reach the parser
                if token in ('comment_begin', 'comment', 'comment_end'):
                    continue
                elif token == 'data':
                    # try to downgrade to a plain bytestring; keep unicode
                    # if the data is not ASCII-safe
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'keyword':
                    # keyword tokens carry the keyword itself as type
                    token = value
                elif token == 'name':
                    value = str(value)
                elif token == 'string':
                    # strip the surrounding quotes and resolve escapes
                    value = unescape_string(lineno, filename, value[1:-1])
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'integer':
                    value = int(value)
                elif token == 'float':
                    value = float(value)
                elif token == 'operator':
                    # operators become their own token type with no value
                    token = operators[value]
                    value = ''
                yield Token(lineno, token, value)
        return TokenStream(generate(), filename)

    def tokeniter(self, source, filename=None):
        """
        This method tokenizes the text and returns the tokens in a generator.
        Use this method if you just want to tokenize a template. The output
        you get is not compatible with the input the jinja parser wants. The
        parser uses the `tokenize` function with returns a `TokenStream` and
        keywords instead of just names.

        Yields ``(lineno, token_type, value)`` tuples.
        """
        # normalize line endings to '\n'
        source = '\n'.join(source.splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        # closing brackets we still expect; while non-empty, block/variable
        # end delimiters that look like operators are not treated as ends
        balancing_stack = []

        while True:
            # tokenizer loop: try each rule of the current state in order
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if not m:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # hidden group: consumed but not yielded, only the
                        # line counter is updated.
                        # NOTE(review): this uses m.group(idx) while the
                        # normal-group branch below uses m.group(idx + 1);
                        # looks like an off-by-one (no current rule uses a
                        # None token, so this branch appears dead) -- confirm.
                        if token is None:
                            g = m.group(idx)
                            if g:
                                lineno += g.count('\n')
                            continue
                        # failure group: raise the configured error
                        elif token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group: yield it if it captured anything
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token just are yielded as it, but just
                # if the data is not empty
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, filename)
                    # yield items
                    if tokens is not None:
                        if data:
                            yield lineno, tokens, data
                        lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          filename)