blob: 2d7c0212733e6e23c0f7cd54339158b24f79b0c1 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
3# $Id$
4#
5# convert template to internal format
6#
7# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
8#
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009# Portions of this engine have been developed in cooperation with
10# CNRI. Hewlett-Packard provided funding for 1.6 integration and
11# other compatibility work.
12#
13
Guido van Rossum7627c0d2000-03-31 14:58:54 +000014import array, string, sys
15
16import _sre
17
18from sre_constants import *
19
# find an array type code that matches the engine's code size
#
# WORDSIZE ends up bound to the first candidate type code whose element
# width (in bytes) equals the value reported by _sre.getcodesize().
for WORDSIZE in "BHil":
    # itemsize reports the element width directly, so there is no need
    # to build and serialize a dummy array just to measure it
    if array.array(WORDSIZE).itemsize == _sre.getcodesize():
        break
else:
    # the loop finished without a break: no candidate had the right width
    raise RuntimeError("cannot find a useable array type")
26
27# FIXME: <fl> should move some optimizations from the parser to here!
28
class Code:
    """Growable sequence of code words.

    The words are kept in the plain list `self.data`; the sequence
    methods below simply forward to that list.  `todata` packs the
    words into an array of type code WORDSIZE.
    """

    def __init__(self):
        # code words collected so far, in the order they were appended
        self.data = []

    def __len__(self):
        # number of code words collected so far
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def __setitem__(self, index, code):
        # used to patch a previously appended placeholder word
        self.data[index] = code

    def append(self, code):
        self.data.append(code)

    def todata(self):
        """Return the code words packed with array type code WORDSIZE.

        The result is whatever `array.array(WORDSIZE, data).tostring()`
        produces.  If a word does not fit the array type, the word list
        is written to sys.stderr (a diagnostic must not pollute the
        program's stdout) and the OverflowError is re-raised unchanged.
        """
        try:
            return array.array(WORDSIZE, self.data).tostring()
        except OverflowError:
            sys.stderr.write("%s\n" % (self.data,))
            raise
Guido van Rossum7627c0d2000-03-31 14:58:54 +000047
def _category_code(category, flags):
    # Map a category to its character-class code, switching to the locale
    # or unicode variant when the matching flag is set.  The locale flag
    # is tested first, so it takes precedence if both flags are given.
    chcode = CHCODES[category]
    if flags & SRE_FLAG_LOCALE:
        return CH_LOCALE[chcode]
    if flags & SRE_FLAG_UNICODE:
        return CH_UNICODE[chcode]
    return chcode

def _compile_block(code, opcode, body, flags, args=()):
    # Emit "opcode, skip, args..., <compiled body>, SUCCESS" and then patch
    # the skip word with the distance from itself to the end of the block.
    append = code.append
    append(opcode)
    skip = len(code); append(0) # placeholder, patched below
    for arg in args:
        append(arg)
    _compile(code, body, flags)
    append(OPCODES[SUCCESS])
    code[skip] = len(code) - skip

def _compile(code, pattern, flags):
    """Append the code words for a parsed pattern to `code`.

    code    -- target buffer; must support append(), len() and item
               assignment (item assignment is used to patch skip words)
    pattern -- iterable of (op, av) pairs, where `op` is one of the
               operator constants and `av` is its argument
    flags   -- bitmask of SRE_FLAG_* values

    Nested items (branch alternatives, call/repeat bodies, subpatterns)
    are compiled by recursion into the same buffer.

    Raises ValueError for an operator that is not handled here (also
    inside a set), and `error` ("nothing to repeat") when the body of a
    repeat has a minimum width of zero and SRE_FLAG_TEMPLATE is not set.
    """
    append = code.append
    for op, av in pattern:
        if op is ANY:
            if flags & SRE_FLAG_DOTALL:
                append(OPCODES[op]) # any character at all!
            else:
                # without DOTALL, "any" is emitted as the
                # not-a-linebreak category
                append(OPCODES[CATEGORY])
                append(CHCODES[CATEGORY_NOT_LINEBREAK])
        elif op in (SUCCESS, FAILURE):
            append(OPCODES[op])
        elif op is AT:
            append(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                append(ATCODES[AT_MULTILINE[av]])
            else:
                append(ATCODES[av])
        elif op is BRANCH:
            append(OPCODES[op])
            jumps = [] # positions of the JUMP offsets to patch at the end
            for alternative in av[1]:
                skip = len(code); append(0)
                _compile(code, alternative, flags)
                append(OPCODES[JUMP])
                jumps.append(len(code)); append(0)
                # distance from the skip word to the next alternative
                code[skip] = len(code) - skip
            append(0) # end of branch
            for jump in jumps:
                # every alternative jumps to the word after the branch
                code[jump] = len(code) - jump
        elif op is CALL:
            _compile_block(code, OPCODES[op], av, flags)
        elif op is CATEGORY:
            append(OPCODES[op])
            append(_category_code(av, flags))
        elif op is GROUP:
            if flags & SRE_FLAG_IGNORECASE:
                append(OPCODES[OP_IGNORE[op]])
            else:
                append(OPCODES[op])
            append(av-1)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                append(OPCODES[OP_IGNORE[op]])
                # flags is bound through a default argument, so the
                # helper does not depend on nested scopes
                def fixup(literal, flags=flags):
                    return _sre.getlower(ord(literal), flags)
            else:
                append(OPCODES[op])
                fixup = ord
            skip = len(code); append(0)
            for set_op, set_av in av:
                # the operator word is emitted before the operator is
                # validated, exactly as the set items are laid out
                append(OPCODES[set_op])
                if set_op is NEGATE:
                    pass
                elif set_op is LITERAL:
                    append(fixup(set_av))
                elif set_op is RANGE:
                    append(fixup(set_av[0]))
                    append(fixup(set_av[1]))
                elif set_op is CATEGORY:
                    append(_category_code(set_av, flags))
                else:
                    raise ValueError("unsupported set operator")
            append(OPCODES[FAILURE]) # terminates the set
            code[skip] = len(code) - skip
        elif op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                append(OPCODES[OP_IGNORE[op]])
            else:
                append(OPCODES[op])
            append(ord(av))
        elif op is MARK:
            append(OPCODES[op])
            append(av)
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                # template mode always emits the plain REPEAT operator
                repeat_op = REPEAT
            else:
                lo, hi = av[2].getwidth()
                if lo == 0:
                    raise error("nothing to repeat")
                # FIXME: <fl> MAX_REPEAT_ONE could be emitted here when
                # lo == hi == 1 and op is MAX_REPEAT, but we need a better
                # way to figure out when it's safe to use (in the parser,
                # probably); until then the optimization stays disabled
                repeat_op = op
            _compile_block(code, OPCODES[repeat_op], av[2], flags,
                           (av[0], av[1]))
        elif op is SUBPATTERN:
            group = av[0]
            if group:
                # opening mark of the group
                append(OPCODES[MARK])
                append((group-1)*2)
            _compile(code, av[1], flags)
            if group:
                # closing mark of the group
                append(OPCODES[MARK])
                append((group-1)*2+1)
        else:
            raise ValueError("unsupported operand type", op)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000178
def compile(p, flags=0):
    """Convert a pattern to the engine's internal format.

    p     -- either the pattern source (plain or unicode string), which
             is parsed with sre_parse.parse first, or an already parsed
             pattern object
    flags -- extra flag bits; they are OR-ed with the flags recorded on
             the parsed pattern

    Returns the result of _sre.compile().  The source string passed on
    to the engine is None when `p` was not given as a string.
    """
    source = None
    if type(p) in (type(""), type(u"")):
        import sre_parse
        source = p
        # NOTE(review): `flags` is not forwarded to the parser here --
        # confirm whether sre_parse.parse should receive it
        p = sre_parse.parse(source)
    state = p.pattern
    flags = state.flags | flags
    # generate the code words, then terminate the program with SUCCESS
    program = Code()
    _compile(program, p.data, flags)
    program.append(OPCODES[SUCCESS])
    packed = program.todata()
    if 0: # debugging
        ruler = "-" * 68
        print("")
        print(ruler)
        import sre_disasm
        sre_disasm.disasm(packed)
        print(ruler)
    return _sre.compile(
        source, flags,
        packed,
        state.groups-1, state.groupdict
        )