# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and Python code in expressions.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
"""
import re
import unicodedata
from jinja2.datastructure import TokenStream, Token
from jinja2.exceptions import TemplateSyntaxError
from weakref import WeakValueDictionary
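
# Minimal usage sketch (illustrative only; in practice the environment
# creates and owns its lexer instance):
#
#   lexer = Lexer(environment)    # `environment` is a configured Environment
#   stream = lexer.tokenize(u'Hello {{ name }}!')
#   # `stream` is a TokenStream of Token(lineno, type, value) entries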


# cache for the lexers so that multiple environments can share the same
# lexer if they use the same delimiter configuration
_lexer_cache = WeakValueDictionary()


# static regular expressions
whitespace_re = re.compile(r'\s+(?um)')
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'\d+\.\d+')

# set of used keywords
keywords = set(['and', 'block', 'elif', 'else', 'endblock', 'print',
                'endfilter', 'endfor', 'endif', 'endmacro', 'endraw',
                'extends', 'filter', 'for', 'if', 'in',
                'include', 'is', 'macro', 'not', 'or', 'raw',
                'recursive', 'set', 'call', 'endcall'])

# bind operators to token types
operators = {
    '+': 'add',
    '-': 'sub',
    '/': 'div',
    '//': 'floordiv',
    '*': 'mul',
    '%': 'mod',
    '**': 'pow',
    '~': 'tilde',
    '[': 'lbracket',
    ']': 'rbracket',
    '(': 'lparen',
    ')': 'rparen',
    '{': 'lbrace',
    '}': 'rbrace',
    '==': 'eq',
    '!=': 'ne',
    '>': 'gt',
    '>=': 'gteq',
    '<': 'lt',
    '<=': 'lteq',
    '=': 'assign',
    '.': 'dot',
    ':': 'colon',
    '|': 'pipe',
    ',': 'comma',
    ';': 'semicolon'
}

reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
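
# Note on the sort above: regex alternation tries the alternatives from left
# to right, so sorting the operators by decreasing length makes multi-character
# operators such as '**', '//' or '<=' match before their single-character
# prefixes '*', '/' and '<'.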

simple_escapes = {
    'a': '\a',
    'n': '\n',
    'r': '\r',
    'f': '\f',
    't': '\t',
    'v': '\v',
    '\\': '\\',
    '"': '"',
    "'": "'",
    '0': '\x00'
}
unicode_escapes = {
    'x': 2,
    'u': 4,
    'U': 8
}


def unescape_string(lineno, filename, s):
    r"""
    Unescape a string. Supported escapes:
        \a, \n, \r, \f, \t, \v, \\, \", \', \0

        \x00, \u0000, \U00000000, \N{...}

    Octal escapes like \101 are not supported because they are redundant.
    """
    result = []
    write = result.append
    chariter = iter(s)
    next_char = chariter.next

    # faster lookup
    sescapes = simple_escapes
    uescapes = unicode_escapes

    try:
        for char in chariter:
            if char == '\\':
                char = next_char()
                if char in sescapes:
                    write(sescapes[char])
                elif char in uescapes:
                    seq = [next_char() for x in xrange(uescapes[char])]
                    try:
                        write(unichr(int(''.join(seq), 16)))
                    except ValueError:
                        raise TemplateSyntaxError('invalid unicode codepoint',
                                                  lineno, filename)
                elif char == 'N':
                    if next_char() != '{':
                        raise TemplateSyntaxError('no name for codepoint',
                                                  lineno, filename)
                    seq = []
                    while 1:
                        char = next_char()
                        if char == '}':
                            break
                        seq.append(char)
                    try:
                        write(unicodedata.lookup(u''.join(seq)))
                    except KeyError:
                        raise TemplateSyntaxError('unknown character name',
                                                  lineno, filename)
                else:
                    write('\\' + char)
            else:
                write(char)
    except StopIteration:
        raise TemplateSyntaxError('invalid string escape', lineno, filename)
    return u''.join(result)
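
# A quick illustration of the escapes handled above (dummy lineno/filename):
#
#   unescape_string(1, None, r'a\tb \x41 \N{BULLET}')
#   -> u'a\tb A \u2022'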


def unescape_regex(s):
    """
    Unescape rules for regular expressions: backslashes in front of
    characters that don't have to be escaped are dropped, all other
    escapes are kept.  `safe_chars` (not defined in this excerpt) is
    assumed to hold the characters that don't need escaping.
    """
    buffer = []
    write = buffer.append
    in_escape = False
    for char in s:
        if in_escape:
            in_escape = False
            if char not in safe_chars:
                write('\\' + char)
                continue
        elif char == '\\':
            # a bare backslash starts an escape sequence; without this branch
            # `in_escape` would never be set and nothing would be unescaped
            in_escape = True
            continue
        write(char)
    return u''.join(buffer)


class Failure(object):
    """
    Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class LexerMeta(type):
    """
    Metaclass for the lexer that caches instances for
    the same configuration in a weak value dictionary.
    """

    def __call__(cls, environment):
        key = hash((environment.block_start_string,
                    environment.block_end_string,
                    environment.variable_start_string,
                    environment.variable_end_string,
                    environment.comment_start_string,
                    environment.comment_end_string,
                    environment.line_statement_prefix,
                    environment.trim_blocks))

        # use the cached lexer if possible
        if key in _lexer_cache:
            return _lexer_cache[key]

        # create a new lexer and cache it
        lexer = type.__call__(cls, environment)
        _lexer_cache[key] = lexer
        return lexer
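
# Caching behaviour sketch (illustrative; assumes a jinja2 `Environment`
# class exposing the delimiter attributes hashed above): two environments
# with the same configuration receive the very same lexer instance, as
# long as the cached lexer is still alive in the weak value dictionary.
#
#   env_a = Environment()
#   env_b = Environment()
#   Lexer(env_a) is Lexer(env_b)   # -> True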


class Lexer(object):
    """
    Class that implements a lexer for a given environment. It is created
    automatically by the environment class, so usually you don't have to
    do that yourself.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    __metaclass__ = LexerMeta

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, None, None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (c(r'\b(?:%s)\b' % '|'.join(sorted(keywords, key=lambda x: -len(x)))),
             'keyword', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]

        # assemble the root lexing rule. because regex alternation matches
        # the leftmost alternative that succeeds we have to sort the tags
        # by length (longest first) so that the lexer continues working as
        # expected when we have parsing rules like <% for blocks and <%=
        # for variables (if someone wants asp like syntax). variables are
        # just part of the rules if variable processing is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        # longest start strings first so that longer tags win the alternation
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules. This is done here so that the escape
        # signs don't count for the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that. We add it only now, after the others have been escaped,
        # and put it at the very front of the list.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), ('data', '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # variables
            'variable_begin': [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules,
            # raw block
            'raw_begin': [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            'linestatement_begin': [
                (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
            ] + tag_rules
        }
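
        # With the default delimiters the 'root' rule above therefore looks
        # for the leftmost of "{% raw %}", "{#", "{%" or "{{" (each with an
        # optional whitespace-eating "-" variant), yields everything before
        # it as 'data' and switches into the corresponding *_begin state.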

    def tokenize(self, source, filename=None):
        """Works like `tokeniter` but returns a `TokenStream` of tokens and
        not a generator of token tuples. Additionally all token values are
        already converted into types and postprocessed. For example keywords
        are already keyword tokens, not named tokens, comments are removed,
        integers and floats converted, strings unescaped etc.
        """
        source = unicode(source)
        def generate():
            for lineno, token, value in self.tokeniter(source, filename):
                if token in ('comment_begin', 'comment', 'comment_end'):
                    continue
                elif token == 'linestatement_begin':
                    token = 'block_begin'
                elif token == 'linestatement_end':
                    token = 'block_end'
                elif token == 'data':
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'keyword':
                    token = value
                elif token == 'name':
                    value = str(value)
                elif token == 'string':
                    value = unescape_string(lineno, filename, value[1:-1])
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'integer':
                    value = int(value)
                elif token == 'float':
                    value = float(value)
                elif token == 'operator':
                    token = operators[value]
                yield Token(lineno, token, value)
        return TokenStream(generate(), filename)
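
        # Rough illustration of the postprocessing `generate` performs on the
        # raw tokeniter tuples (values spelled out for readability):
        #
        #   tokeniter yields:  (1, 'keyword', u'if'),  (1, 'string', u"'a\\nb'")
        #   tokenize yields:   Token(1, u'if', u'if'), Token(1, 'string', 'a\nb')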

    def tokeniter(self, source, filename=None):
        """
        This method tokenizes the text and returns the tokens in a generator.
        Use this method if you just want to tokenize a template. The output
        you get is not compatible with the input the jinja parser wants. The
        parser uses the `tokenize` function which returns a `TokenStream` and
        keywords instead of just names.
        """
        source = '\n'.join(source.splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # hidden group
                        if token is None:
                            g = m.group(idx)
                            if g:
                                lineno += g.count('\n')
                            continue
                        # failure group
                        elif token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as tokens are just yielded as they are
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, filename)
                    # yield items
                    if tokens is not None:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch the new position into a new variable so that we can
                # check if there is an internal parsing error which would
                # result in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # set the new position and start over
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match; either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          filename)
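
    # What `tokeniter` yields for a tiny template with the default delimiters
    # (illustrative trace; whitespace-only matches are skipped):
    #
    #   u'Hello {{ name }}!'
    #   -> (1, 'data', u'Hello '), (1, 'variable_begin', u'{{'),
    #      (1, 'name', u'name'), (1, 'variable_end', u'}}'), (1, 'data', u'!')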