blob: 53da005e763a727057281ff4ba9f7cc0079b4dec [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
3# $Id$
4#
5# convert template to internal format
6#
7# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
8#
9# This code can only be used for 1.6 alpha testing. All other use
10# require explicit permission from Secret Labs AB.
11#
12# Portions of this engine have been developed in cooperation with
13# CNRI. Hewlett-Packard provided funding for 1.6 integration and
14# other compatibility work.
15#
16
Guido van Rossum7627c0d2000-03-31 14:58:54 +000017import array, string, sys
18
19import _sre
20
21from sre_constants import *
22
# Pick the array type code whose item size equals the code word size
# reported by the engine; the candidates are tried in the order given.
_candidates = "BHil"
_index = 0
WORDSIZE = None
while _index < len(_candidates):
    WORDSIZE = _candidates[_index]
    if array.array(WORDSIZE).itemsize == _sre.getcodesize():
        break
    _index = _index + 1
else:
    # none of the candidates has the size the engine asks for
    raise RuntimeError("cannot find a useable array type")
del _candidates, _index
29
30# FIXME: <fl> should move some optimizations from the parser to here!
31
class Code:
    """List-backed buffer that collects the code words of a pattern."""

    def __init__(self):
        # words are kept in a plain list until todata() packs them
        self.data = []

    def __len__(self):
        """Return the number of words collected so far."""
        words = self.data
        return len(words)

    def __getitem__(self, index):
        """Return the word stored at index."""
        words = self.data
        return words[index]

    def __setitem__(self, index, code):
        """Overwrite the word stored at index with code."""
        words = self.data
        words[index] = code

    def append(self, code):
        """Add code after the last word collected so far."""
        words = self.data
        words.append(code)

    def todata(self):
        """Return the words packed into an array of type WORDSIZE, as a string.

        If packing raises OverflowError, the word list is printed for
        diagnosis and the exception is re-raised.
        """
        try:
            packed = array.array(WORDSIZE, self.data)
            return packed.tostring()
        except OverflowError:
            print(self.data)
            raise
Guido van Rossum7627c0d2000-03-31 14:58:54 +000050
def _category_code(category, flags):
    """Return the engine code for a character category under flags."""
    if flags & SRE_FLAG_LOCALE:
        # Remap the category first and encode the result afterwards, the
        # same order used with OP_IGNORE/OPCODES and AT_MULTILINE/ATCODES.
        # NOTE(review): assumes CH_LOCALE is keyed by category constants,
        # like the other remapping tables -- confirm in sre_constants.
        return CHCODES[CH_LOCALE[category]]
    return CHCODES[category]

def _compile(code, pattern, flags, level=0):
    """Append the engine code for a parsed pattern to code.

    code    -- buffer receiving the generated words; it must support
               append(), len() and item assignment
    pattern -- sequence of (op, av) pairs to translate
    flags   -- bitmask of SRE_FLAG_* values steering code generation
    level   -- nesting depth; it is only handed on to recursive calls

    Raises SyntaxError when a repeated item reports a minimum width of
    zero, and ValueError for an operator that is not handled here.
    """
    append = code.append
    for op, av in pattern:
        if op is ANY:
            if flags & SRE_FLAG_DOTALL:
                append(OPCODES[op]) # any character at all!
            else:
                # without DOTALL, emit a "not a linebreak" category test
                append(OPCODES[CATEGORY])
                append(CHCODES[CATEGORY_NOT_LINEBREAK])
        elif op in (SUCCESS, FAILURE):
            append(OPCODES[op])
        elif op is AT:
            append(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                # positions that have no multiline variant keep their own
                # code instead of failing the table lookup
                # NOTE(review): assumes AT_MULTILINE is a dictionary
                append(ATCODES[AT_MULTILINE.get(av, av)])
            else:
                append(ATCODES[av])
        elif op is BRANCH:
            append(OPCODES[op])
            tails = []
            for alternative in av[1]:
                # placeholder for the distance to the next alternative
                skip = len(code)
                append(0)
                _compile(code, alternative, flags, level)
                append(OPCODES[JUMP])
                # placeholder for the jump past the whole branch
                tails.append(len(code))
                append(0)
                code[skip] = len(code) - skip
            append(0) # end of branch
            for tail in tails:
                code[tail] = len(code) - tail
        elif op is CALL:
            append(OPCODES[op])
            skip = len(code)
            append(0)
            _compile(code, av, flags, level+1)
            append(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CATEGORY: # not used by current parser
            append(OPCODES[op])
            append(_category_code(av, flags))
        elif op is GROUP:
            if flags & SRE_FLAG_IGNORECASE:
                append(OPCODES[OP_IGNORE[op]])
            else:
                append(OPCODES[op])
            append(av-1)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                append(OPCODES[OP_IGNORE[op]])
                # set members are stored lowercased when ignoring case
                def fixup(literal):
                    return ord(literal.lower())
            else:
                append(OPCODES[op])
                fixup = ord
            skip = len(code)
            append(0)
            for set_op, set_av in av:
                append(OPCODES[set_op])
                if set_op is NEGATE:
                    pass
                elif set_op is LITERAL:
                    append(fixup(set_av))
                elif set_op is RANGE:
                    append(fixup(set_av[0]))
                    append(fixup(set_av[1]))
                elif set_op is CATEGORY:
                    append(_category_code(set_av, flags))
                else:
                    raise ValueError("unsupported set operator")
            append(OPCODES[FAILURE])
            code[skip] = len(code) - skip
        elif op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                append(OPCODES[OP_IGNORE[op]])
                append(ord(av.lower()))
            else:
                append(OPCODES[op])
                append(ord(av))
        elif op is MARK:
            append(OPCODES[op])
            append(av)
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            lo, hi = av[2].getwidth()
            if lo == 0:
                raise SyntaxError("cannot repeat zero-width items")
            # choose the opcode that opens the repeat and the one closing it
            if lo == hi == 1 and op is MAX_REPEAT:
                head = MAX_REPEAT_ONE
                closing = SUCCESS
            elif op is MIN_REPEAT:
                head = op
                closing = MIN_UNTIL
            else:
                head = op
                closing = MAX_UNTIL
            append(OPCODES[head])
            skip = len(code)
            append(0)
            append(av[0])
            append(av[1])
            _compile(code, av[2], flags, level+1)
            append(OPCODES[closing])
            code[skip] = len(code) - skip
        elif op is SUBPATTERN:
            group = av[0]
            # a false group value means no marks are emitted around the body
            if group:
                append(OPCODES[MARK])
                append((group-1)*2)
            _compile(code, av[1], flags, level+1)
            if group:
                append(OPCODES[MARK])
                append((group-1)*2+1)
        else:
            raise ValueError("unsupported operand type", op)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000169
def compile(p, flags=0):
    """Convert a pattern to the engine's internal format.

    p is either a pattern string (plain or unicode), which is first run
    through sre_parse.parse, or an already parsed pattern object.  flags
    is OR-ed with the flags recorded on the parsed pattern.  The result
    is whatever _sre.compile returns for the generated code.
    """
    if type(p) not in (type(""), type(u"")):
        # already parsed: there is no source string to pass along
        source = None
        parsed = p
    else:
        import sre_parse
        source = p
        parsed = sre_parse.parse(p)
    state = parsed.pattern
    flags = state.flags | flags
    code = Code()
    _compile(code, parsed.data, flags)
    # terminate the program so the engine knows where the pattern ends
    code.append(OPCODES[SUCCESS])
    data = code.todata()
    if 0: # debugging
        print("")
        print("-" * 68)
        import sre_disasm
        sre_disasm.disasm(data)
        print("-" * 68)
    group_count = state.groups-1
    return _sre.compile(source, flags, data, group_count, state.groupdict)