Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 1 | # |
| 2 | # Secret Labs' Regular Expression Engine |
| 3 | # $Id$ |
| 4 | # |
| 5 | # convert template to internal format |
| 6 | # |
| 7 | # Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. |
| 8 | # |
| 9 | # This code can only be used for 1.6 alpha testing. All other use |
| 10 | # require explicit permission from Secret Labs AB. |
| 11 | # |
| 12 | # Portions of this engine have been developed in cooperation with |
| 13 | # CNRI. Hewlett-Packard provided funding for 1.6 integration and |
| 14 | # other compatibility work. |
| 15 | # |
| 16 | |
Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 17 | import array, string, sys |
| 18 | |
| 19 | import _sre |
| 20 | |
| 21 | from sre_constants import * |
| 22 | |
# Select the array type code whose item width (in bytes) equals the size
# of one engine code word, as reported by _sre.getcodesize().  The
# candidates are tried in the order given; the first match is kept in
# WORDSIZE.
for WORDSIZE in "BHil":
    if array.array(WORDSIZE).itemsize == _sre.getcodesize():
        break
else:
    # the loop ran to completion: none of the candidates has the right width
    raise RuntimeError("cannot find a useable array type")
| 29 | |
| 30 | # FIXME: <fl> should move some optimizations from the parser to here! |
| 31 | |
class Code:
    """Growable buffer of engine code words.

    A thin wrapper around a plain list (self.data) that supports len(),
    indexing, item assignment and append(), plus todata() to pack the
    words into a string.
    """

    def __init__(self):
        self.data = []

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def __setitem__(self, index, code):
        self.data[index] = code

    def append(self, code):
        self.data.append(code)

    def todata(self):
        """Return the buffer packed as an array of WORDSIZE items.

        The result is the string produced by array.tostring().

        Raises OverflowError if a value does not fit the WORDSIZE array
        type.  Before re-raising, the buffer contents are written to
        sys.stderr as a debugging aid; stdout is left alone so that the
        output of programs using this module is not polluted.
        """
        try:
            return array.array(WORDSIZE, self.data).tostring()
        except OverflowError:
            sys.stderr.write("%s\n" % (self.data,))
            raise
Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 50 | |
def _lower_ord(literal):
    """Return the character code of *literal* after lower-casing it."""
    return ord(literal.lower())


def _category_code(category, flags):
    """Return the code word for a character category.

    With SRE_FLAG_LOCALE set, the category is first replaced by its
    CH_LOCALE entry; the result is then encoded through CHCODES.
    """
    if flags & SRE_FLAG_LOCALE:
        # Map first, then encode -- the same order the AT branch of
        # _compile uses for AT_MULTILINE/ATCODES.  The earlier form,
        # CH_LOCALE[CHCODES[av]], indexed CH_LOCALE with the encoded
        # value and emitted the table entry without encoding it.
        # NOTE(review): assumes CH_LOCALE maps category names to category
        # names (as AT_MULTILINE does for AT codes) -- confirm against
        # sre_constants.
        category = CH_LOCALE[category]
    return CHCODES[category]


def _compile(code, pattern, flags, level=0):
    """Append the engine code for a parsed (sub)pattern to *code*.

    code -- output buffer; only append(), len() and item assignment
        are used
    pattern -- iterable of (op, av) pairs
    flags -- bit mask tested against the SRE_FLAG_* constants
    level -- nesting depth; handed on to recursive calls (incremented
        for everything except BRANCH alternatives), not otherwise used

    Constructs with a variable-length body emit a placeholder word that
    is patched afterwards with the distance from the placeholder to the
    end of the construct.

    Raises SyntaxError if the body of a repeat reports a lower width
    bound of 0 from getwidth(), and ValueError for an operator (or set
    operator) that is not handled here.
    """
    emit = code.append
    for op, av in pattern:
        if op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[op])  # any character at all!
            else:
                emit(OPCODES[CATEGORY])
                emit(CHCODES[CATEGORY_NOT_LINEBREAK])
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                emit(ATCODES[AT_MULTILINE[av]])
            else:
                emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            jumps = []
            for alternative in av[1]:
                skip = len(code)
                emit(0)  # placeholder: offset to the next alternative
                _compile(code, alternative, flags, level)
                emit(OPCODES[JUMP])
                jumps.append(len(code))
                emit(0)  # placeholder: offset to the end of the branch
                code[skip] = len(code) - skip
            emit(0)  # end of branch
            for jump in jumps:
                code[jump] = len(code) - jump
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code)
            emit(0)
            _compile(code, av, flags, level+1)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CATEGORY:  # not used by current parser
            emit(OPCODES[op])
            emit(_category_code(av, flags))
        elif op is GROUP:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                fixup = _lower_ord
            else:
                emit(OPCODES[op])
                fixup = ord
            skip = len(code)
            emit(0)
            for set_op, set_av in av:
                # the opcode is emitted before the operator is validated
                emit(OPCODES[set_op])
                if set_op is NEGATE:
                    pass
                elif set_op is LITERAL:
                    emit(fixup(set_av))
                elif set_op is RANGE:
                    emit(fixup(set_av[0]))
                    emit(fixup(set_av[1]))
                elif set_op is CATEGORY:
                    emit(_category_code(set_av, flags))
                else:
                    raise ValueError("unsupported set operator")
            emit(OPCODES[FAILURE])
            code[skip] = len(code) - skip
        elif op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(ord(av.lower()))
            else:
                emit(OPCODES[op])
                emit(ord(av))
        elif op is MARK:
            emit(OPCODES[op])
            emit(av)
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            lo, hi = av[2].getwidth()
            if lo == 0:
                raise SyntaxError("cannot repeat zero-width items")
            # Choose the opening and closing opcodes; everything between
            # them is laid out the same way for all repeat flavours.
            if lo == hi == 1 and op is MAX_REPEAT:
                head, tail = MAX_REPEAT_ONE, SUCCESS
            elif op is MIN_REPEAT:
                head, tail = op, MIN_UNTIL
            else:
                head, tail = op, MAX_UNTIL
            emit(OPCODES[head])
            skip = len(code)
            emit(0)
            emit(av[0])
            emit(av[1])
            _compile(code, av[2], flags, level+1)
            emit(OPCODES[tail])
            code[skip] = len(code) - skip
        elif op is SUBPATTERN:
            group = av[0]
            if group:
                # marks 2*(group-1) and 2*(group-1)+1 bracket the group
                emit(OPCODES[MARK])
                emit((group-1)*2)
            _compile(code, av[1], flags, level+1)
            if group:
                emit(OPCODES[MARK])
                emit((group-1)*2+1)
        else:
            raise ValueError("unsupported operand type", op)
Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 169 | |
def compile(p, flags=0):
    """Convert a pattern to the engine's internal format.

    p -- either a pattern string (8-bit or Unicode), which is parsed
        with sre_parse.parse(), or an object that is used as the parsed
        pattern directly
    flags -- bit mask; combined with the flags stored on p.pattern

    Returns whatever _sre.compile() returns for the packed code.  The
    first argument handed to _sre.compile() is the original string, or
    None when p was not a string.
    """
    source = None
    if type(p) in (type(""), type(u"")):
        import sre_parse
        source = p
        p = sre_parse.parse(source)
    flags = p.pattern.flags | flags
    program = Code()
    _compile(program, p.data, flags)
    # terminate the top-level program
    program.append(OPCODES[SUCCESS])
    packed = program.todata()
    if 0:  # debugging
        print("")
        print("-" * 68)
        import sre_disasm
        sre_disasm.disasm(packed)
        print("-" * 68)
    state = p.pattern
    return _sre.compile(
        source, flags,
        packed,
        state.groups-1, state.groupdict
        )