Armin Ronacher | 92f572f | 2007-02-26 22:17:32 +0100 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | """ |
Armin Ronacher | 07bc684 | 2008-03-31 14:18:49 +0200 | [diff] [blame] | 3 | jinja2.lexer |
| 4 | ~~~~~~~~~~~~ |
Armin Ronacher | 3b65b8a | 2007-02-27 20:21:45 +0100 | [diff] [blame] | 5 | |
Armin Ronacher | 5a8e497 | 2007-04-05 11:21:38 +0200 | [diff] [blame] | 6 | This module implements a Jinja / Python combination lexer. The |
| 7 | `Lexer` class provided by this module is used to do some preprocessing |
| 8 | for Jinja. |
| 9 | |
| 10 | On the one hand it filters out invalid operators like the bitshift |
| 11 | operators we don't allow in templates. On the other hand it separates |
| 12 | template code and python code in expressions. |
| 13 | |
Armin Ronacher | 1d51f63 | 2008-03-25 14:34:45 +0100 | [diff] [blame] | 14 | :copyright: 2007-2008 by Armin Ronacher. |
Armin Ronacher | 3b65b8a | 2007-02-27 20:21:45 +0100 | [diff] [blame] | 15 | :license: BSD, see LICENSE for more details. |
Armin Ronacher | 92f572f | 2007-02-26 22:17:32 +0100 | [diff] [blame] | 16 | """ |
| 17 | import re |
Armin Ronacher | 1cc232c | 2007-09-07 17:52:41 +0200 | [diff] [blame] | 18 | import unicodedata |
Armin Ronacher | 82b3f3d | 2008-03-31 20:01:08 +0200 | [diff] [blame] | 19 | from jinja2.datastructure import TokenStream, Token |
| 20 | from jinja2.exceptions import TemplateSyntaxError |
Armin Ronacher | 2158091 | 2007-04-17 17:13:10 +0200 | [diff] [blame] | 21 | from weakref import WeakValueDictionary |
Armin Ronacher | 92f572f | 2007-02-26 22:17:32 +0100 | [diff] [blame] | 22 | |
| 23 | |
Armin Ronacher | 5a8e497 | 2007-04-05 11:21:38 +0200 | [diff] [blame] | 24 | __all__ = ['Lexer', 'Failure', 'keywords'] |
| 25 | |
| 26 | |
# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer.  Values are held weakly, so a lexer
# disappears from the cache once nothing else references it.
_lexer_cache = WeakValueDictionary()


# static regular expressions.  The flags are handed to `re.compile`
# instead of being embedded as a trailing "(?...)" group: global inline
# flags that are not at the very start of a pattern are deprecated and
# rejected by newer versions of the `re` module.  Matching is unchanged.
whitespace_re = re.compile(r'\s+', re.U | re.M)
# single or double quoted string literal with backslash escapes; may span
# multiple lines
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.M | re.S)
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'\d+\.\d+')
# one or more line ends together with the whitespace around them
eol_re = re.compile(r'(\s*$\s*)+', re.M)


# set of used keywords
keywords = set(['and', 'block', 'elif', 'else', 'endblock', 'print',
                'endfilter', 'endfor', 'endif', 'endmacro', 'endraw',
                'endtrans', 'extends', 'filter', 'for', 'if', 'in',
                'include', 'is', 'macro', 'not', 'or', 'pluralize', 'raw',
                'recursive', 'set', 'trans', 'call', 'endcall'])
Armin Ronacher | 1cc232c | 2007-09-07 17:52:41 +0200 | [diff] [blame] | 48 | |
# bind operators to token types
operators = {
    '+': 'add',
    '-': 'sub',
    '/': 'div',
    '//': 'floordiv',
    '*': 'mul',
    '%': 'mod',
    '**': 'pow',
    '~': 'tilde',
    '[': 'lbracket',
    ']': 'rbracket',
    '(': 'lparen',
    ')': 'rparen',
    '{': 'lbrace',
    '}': 'rbrace',
    '==': 'eq',
    '!=': 'ne',
    '>': 'gt',
    '>=': 'gteq',
    '<': 'lt',
    '<=': 'lteq',
    '=': 'assign',
    '.': 'dot',
    ':': 'colon',
    '|': 'pipe',
    ',': 'comma',
    ';': 'semicolon'
}

# inverse table (token type -> operator).  The assertion guards against two
# operators accidentally sharing one token type, which would silently drop
# an entry from the inverse table.
reverse_operators = dict((name, op) for op, name in operators.items())
assert len(operators) == len(reverse_operators), 'operators dropped'
# longest operators come first in the alternation so that e.g. "**" is
# tried before "*"
operator_re = re.compile('(%s)' % '|'.join(
    re.escape(op) for op in sorted(operators, key=len, reverse=True)))

# single character escapes and the character each one stands for
simple_escapes = {
    'a': '\a',
    'n': '\n',
    'r': '\r',
    'f': '\f',
    't': '\t',
    'v': '\v',
    '\\': '\\',
    '"': '"',
    "'": "'",
    '0': '\x00'
}
# numeric escapes and the number of hex digits that follow the marker
unicode_escapes = {
    'x': 2,
    'u': 4,
    'U': 8
}
| 101 | |
Armin Ronacher | 1cc232c | 2007-09-07 17:52:41 +0200 | [diff] [blame] | 102 | |
def unescape_string(lineno, filename, s):
    r"""
    Unescape a string. Supported escapes:
        \a, \n, \r, \f, \t, \v, \\, \", \', \0

        \x00, \u0000, \U00000000, \N{...}

    Not supported are \101 because imho redundant.

    A backslash followed by any other character is copied to the output
    unchanged (backslash included).

    `lineno` and `filename` are only used for error reporting.  A
    `TemplateSyntaxError` is raised if the string ends in the middle of
    an escape sequence, if a numeric escape does not describe a valid
    codepoint, or if a ``\N{...}`` name is missing or unknown.
    """
    result = []
    write = result.append
    chariter = iter(s)
    next_char = chariter.next

    # faster lookup
    sescapes = simple_escapes
    uescapes = unicode_escapes

    try:
        for char in chariter:
            if char == '\\':
                char = next_char()
                if char in sescapes:
                    write(sescapes[char])
                elif char in uescapes:
                    # fixed-width hexadecimal escape: read exactly as many
                    # digits as the escape marker asks for
                    seq = [next_char() for x in xrange(uescapes[char])]
                    try:
                        write(unichr(int(''.join(seq), 16)))
                    # ValueError: non-hex digits or a codepoint out of
                    # range.  OverflowError: the number does not even fit
                    # into a C int (e.g. \UFFFFFFFF); report it the same
                    # way instead of leaking a low-level exception.
                    except (ValueError, OverflowError):
                        raise TemplateSyntaxError('invalid unicode codepoint',
                                                  lineno, filename)
                elif char == 'N':
                    if next_char() != '{':
                        raise TemplateSyntaxError('no name for codepoint',
                                                  lineno, filename)
                    # collect the character name up to the closing brace
                    seq = []
                    while 1:
                        char = next_char()
                        if char == '}':
                            break
                        seq.append(char)
                    try:
                        write(unicodedata.lookup(u''.join(seq)))
                    except KeyError:
                        raise TemplateSyntaxError('unknown character name',
                                                  lineno, filename)
                else:
                    # unknown escape: keep it verbatim
                    write('\\' + char)
            else:
                write(char)
    except StopIteration:
        # the input ended while an escape sequence was still being read
        raise TemplateSyntaxError('invalid string escape', lineno, filename)
    return u''.join(result)
| 156 | |
| 157 | |
def unescape_regex(s, safe_chars=''):
    """
    Unescape rules for regular expressions.

    A backslash followed by one of the characters in `safe_chars` is
    replaced by that character alone.  Every other escape sequence, and a
    lone backslash at the very end, is copied to the output unchanged so
    that it still reaches the regular expression engine.

    With the default (empty) `safe_chars` nothing is unescaped and the
    input is returned unchanged, as a unicode string.
    """
    parts = []
    write = parts.append
    in_escape = False
    for char in s:
        if in_escape:
            in_escape = False
            if char not in safe_chars:
                # not a character we may unescape: keep the backslash
                write('\\' + char)
                continue
        elif char == '\\':
            # start of an escape sequence; the next character decides
            # whether the backslash is kept
            in_escape = True
            continue
        write(char)
    if in_escape:
        # dangling backslash at the end of the input
        write('\\')
    return u''.join(parts)
Armin Ronacher | 2894f22 | 2007-03-19 22:39:55 +0100 | [diff] [blame] | 173 | |
Armin Ronacher | 92f572f | 2007-02-26 22:17:32 +0100 | [diff] [blame] | 174 | |
class Failure(object):
    """
    Callable stand-in for a known lexing error.

    The `Lexer` puts instances of this class into its rule tables where
    a token name would normally go.  Calling the instance raises the
    configured exception class (`TemplateSyntaxError` by default) with
    the stored message and the position passed to the call.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.error_class = cls
        self.message = message

    def __call__(self, lineno, filename):
        error = self.error_class(self.message, lineno, filename)
        raise error
Armin Ronacher | 92f572f | 2007-02-26 22:17:32 +0100 | [diff] [blame] | 187 | |
| 188 | |
class LexerMeta(type):
    """
    Metaclass for the lexer that caches instances for
    the same configuration in a weak value dictionary.
    """

    def __call__(cls, environment):
        """
        Return the cached lexer for the syntax configured on
        `environment`, creating and caching a new one if there is none.
        """
        # The configuration tuple itself is the key.  Using only its hash
        # would hand out the same lexer for two different configurations
        # whenever their hashes collide.
        key = (environment.block_start_string,
               environment.block_end_string,
               environment.variable_start_string,
               environment.variable_end_string,
               environment.comment_start_string,
               environment.comment_end_string,
               environment.trim_blocks)

        # use the cached lexer if possible.  A single `get` is used on
        # purpose: with a weak value dictionary the entry may vanish
        # between a membership test and a subsequent item lookup.
        lexer = _lexer_cache.get(key)
        if lexer is None:
            # create a new lexer and cache it
            lexer = type.__call__(cls, environment)
            _lexer_cache[key] = lexer
        return lexer
| 212 | |
| 213 | |
class Lexer(object):
    """
    Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    __metaclass__ = LexerMeta

    def __init__(self, environment):
        """
        Build the rule tables from the delimiters and the `trim_blocks`
        setting of `environment`.

        `self.rules` maps a state name to a list of
        ``(regex, tokens, new_state)`` triples which `tokeniter` tries
        in order.
        """
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # escaped delimiters that are used by several rules below
        block_start = e(environment.block_start_string)
        block_end = e(environment.block_end_string)
        comment_end = e(environment.comment_end_string)

        # lexing rules for tags
        tag_rules = [
            (eol_re, 'eol', None),
            (whitespace_re, None, None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (c(r'\b(?:%s)\b' % '|'.join(sorted(keywords,
                                               key=lambda x: -len(x)))),
             'keyword', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]

        #: if variables and blocks have the same delimiters we won't
        #: receive any variable blocks in the parser. This variable is `True`
        #: if we need that.
        self.no_variable_block = (
            (environment.variable_start_string is
             environment.variable_end_string is None) or
            (environment.variable_start_string ==
             environment.block_start_string and
             environment.variable_end_string ==
             environment.block_end_string)
        )

        # assemble the root lexing rule. because "|" takes the first
        # alternative that matches we have to sort by length so that the
        # lexer continues working as expected when we have parsing rules
        # like <% for block and <%= for variables. (if someone wants asp
        # like syntax) variables are just part of the rules if variable
        # processing is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string)
        ]
        if not self.no_variable_block:
            root_tag_rules.append(('variable',
                                   environment.variable_start_string))
        # longest start delimiter first (stable for equal lengths)
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        # alternatives of the root rule: the raw directive first, then one
        # named group per tag kind.  The name of the group that matched
        # becomes both the token type and the next state (#bygroup).
        root_parts = [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
            block_start, block_start, block_end
        )]
        root_parts.extend(
            r'(?P<%s_begin>\s*%s\-|%s)' % (n, e(r), e(r))
            for n, r in root_tag_rules
        )

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(root_parts)),
                 ('data', '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    comment_end,
                    comment_end,
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c(r'(?:\-%s\s*|%s)%s' % (
                    block_end,
                    block_end,
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # raw block
            'raw_begin': [
                (c(r'(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    block_start,
                    block_start,
                    block_end,
                    block_end,
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ]
        }

        # only add the variable rules to the list if we process variables
        # the variable_end_string variable could be None and break things.
        if not self.no_variable_block:
            variable_end = e(environment.variable_end_string)
            self.rules['variable_begin'] = [
                (c(r'\-%s\s*|%s' % (
                    variable_end,
                    variable_end
                )), 'variable_end', '#pop')
            ] + tag_rules

    def tokenize(self, source, filename=None):
        """
        Works like `tokeniter` but returns a tokenstream of tokens and not a
        generator or token tuples. Additionally all token values are already
        converted into types and postprocessed. For example keywords are
        already keyword tokens, not named tokens, comments are removed,
        integers and floats converted, strings unescaped etc.
        """
        def generate():
            for lineno, token, value in self.tokeniter(source, filename):
                if token in ('comment_begin', 'comment', 'comment_end'):
                    continue
                elif token == 'data':
                    # best effort: use a plain string if the data allows
                    # it, otherwise keep the value as it is
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'keyword':
                    # the keyword itself becomes the token type
                    token = value
                elif token == 'name':
                    value = str(value)
                elif token == 'string':
                    # strip the quotes, then resolve escape sequences
                    value = unescape_string(lineno, filename, value[1:-1])
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'integer':
                    value = int(value)
                elif token == 'float':
                    value = float(value)
                elif token == 'operator':
                    # operators get their symbolic name as token type and
                    # carry no value
                    token = operators[value]
                    value = ''
                # all other token types are passed through unchanged
                yield Token(lineno, token, value)
        return TokenStream(generate(), filename)

    def tokeniter(self, source, filename=None):
        """
        This method tokenizes the text and returns the tokens in a generator.
        Use this method if you just want to tokenize a template. The output
        you get is not compatible with the input the jinja parser wants. The
        parser uses the `tokenize` function which returns a `TokenStream` and
        keywords instead of just names.

        Yields ``(lineno, token, value)`` tuples.  Raises a
        `TemplateSyntaxError` for unbalanced brackets, for unterminated
        comments / raw directives and for characters no rule matches.
        """
        # normalize all line endings to "\n"
        source = '\n'.join(source.splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        # closing brackets we still expect, innermost last
        balancing_stack = []

        while True:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if not m:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # hidden group: nothing is yielded but the line
                        # counter still has to advance.  Group numbers are
                        # 1-based while `idx` is 0-based, exactly as in
                        # the "normal group" branch below.
                        if token is None:
                            g = m.group(idx + 1)
                            if g:
                                lineno += g.count('\n')
                            continue
                        # failure group
                        elif token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token just are yielded as it, but just
                # if the data is not empty
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, filename)
                    # yield items
                    if tokens is not None:
                        if data:
                            yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is a internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new position and start again
                pos = pos2
                break
            # if loop terminated without break we haven't found a single
            # match either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          filename)