blob: fa0cf37e8b8b05b537ad2658a94f8288070e1808 [file] [log] [blame]
Guido van Rossumaad67612000-05-08 17:31:04 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossumaad67612000-05-08 17:31:04 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossumaad67612000-05-08 17:31:04 +00008# Portions of this engine have been developed in cooperation with
9# CNRI. Hewlett-Packard provided funding for 1.6 integration and
10# other compatibility work.
11#
12
Guido van Rossum3e06ab12000-06-29 19:35:29 +000013import array
Guido van Rossumaad67612000-05-08 17:31:04 +000014import _sre
15
16from sre_constants import *
17
18# find an array type code that matches the engine's code size
19for WORDSIZE in "BHil":
20 if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
Guido van Rossum4358b2c2000-06-30 16:13:37 +000021 break
Guido van Rossumaad67612000-05-08 17:31:04 +000022else:
Thomas Wouters7e474022000-07-16 12:04:32 +000023 raise RuntimeError, "cannot find a usable array type"
Guido van Rossumaad67612000-05-08 17:31:04 +000024
Guido van Rossumaad67612000-05-08 17:31:04 +000025def _compile(code, pattern, flags):
Guido van Rossum4358b2c2000-06-30 16:13:37 +000026 # internal: compile a (sub)pattern
Guido van Rossum3e06ab12000-06-29 19:35:29 +000027 emit = code.append
Guido van Rossumaad67612000-05-08 17:31:04 +000028 for op, av in pattern:
Guido van Rossum4358b2c2000-06-30 16:13:37 +000029 if op in (LITERAL, NOT_LITERAL):
30 if flags & SRE_FLAG_IGNORECASE:
31 emit(OPCODES[OP_IGNORE[op]])
32 else:
33 emit(OPCODES[op])
34 emit(av)
35 elif op is IN:
36 if flags & SRE_FLAG_IGNORECASE:
37 emit(OPCODES[OP_IGNORE[op]])
38 def fixup(literal, flags=flags):
39 return _sre.getlower(literal, flags)
40 else:
41 emit(OPCODES[op])
Guido van Rossumc08cb042000-07-01 04:23:47 +000042 fixup = lambda x: x
Guido van Rossum4358b2c2000-06-30 16:13:37 +000043 skip = len(code); emit(0)
44 for op, av in av:
45 emit(OPCODES[op])
46 if op is NEGATE:
47 pass
48 elif op is LITERAL:
49 emit(fixup(av))
50 elif op is RANGE:
51 emit(fixup(av[0]))
52 emit(fixup(av[1]))
53 elif op is CATEGORY:
54 if flags & SRE_FLAG_LOCALE:
55 emit(CHCODES[CH_LOCALE[av]])
56 elif flags & SRE_FLAG_UNICODE:
57 emit(CHCODES[CH_UNICODE[av]])
58 else:
59 emit(CHCODES[av])
60 else:
61 raise error, "internal: unsupported set operator"
62 emit(OPCODES[FAILURE])
63 code[skip] = len(code) - skip
64 elif op is ANY:
65 if flags & SRE_FLAG_DOTALL:
66 emit(OPCODES[op])
67 else:
68 emit(OPCODES[CATEGORY])
69 emit(CHCODES[CATEGORY_NOT_LINEBREAK])
70 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
71 if flags & SRE_FLAG_TEMPLATE:
72 emit(OPCODES[REPEAT])
73 skip = len(code); emit(0)
74 emit(av[0])
75 emit(av[1])
76 _compile(code, av[2], flags)
77 emit(OPCODES[SUCCESS])
78 code[skip] = len(code) - skip
79 else:
80 lo, hi = av[2].getwidth()
81 if lo == 0:
82 raise error, "nothing to repeat"
83 if 0 and lo == hi == 1 and op is MAX_REPEAT:
84 # FIXME: <fl> need a better way to figure out when
85 # it's safe to use this one (in the parser, probably)
86 emit(OPCODES[MAX_REPEAT_ONE])
87 skip = len(code); emit(0)
88 emit(av[0])
89 emit(av[1])
90 _compile(code, av[2], flags)
91 emit(OPCODES[SUCCESS])
92 code[skip] = len(code) - skip
93 else:
94 emit(OPCODES[op])
95 skip = len(code); emit(0)
96 emit(av[0])
97 emit(av[1])
98 _compile(code, av[2], flags)
99 emit(OPCODES[SUCCESS])
100 code[skip] = len(code) - skip
101 elif op is SUBPATTERN:
102 group = av[0]
103 if group:
104 emit(OPCODES[MARK])
105 emit((group-1)*2)
106 _compile(code, av[1], flags)
107 if group:
108 emit(OPCODES[MARK])
109 emit((group-1)*2+1)
110 elif op in (SUCCESS, FAILURE):
111 emit(OPCODES[op])
112 elif op in (ASSERT, ASSERT_NOT, CALL):
113 emit(OPCODES[op])
114 skip = len(code); emit(0)
115 _compile(code, av, flags)
116 emit(OPCODES[SUCCESS])
117 code[skip] = len(code) - skip
118 elif op is AT:
119 emit(OPCODES[op])
120 if flags & SRE_FLAG_MULTILINE:
Guido van Rossumc08cb042000-07-01 04:23:47 +0000121 emit(ATCODES[AT_MULTILINE.get(av, av)])
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000122 else:
123 emit(ATCODES[av])
124 elif op is BRANCH:
125 emit(OPCODES[op])
126 tail = []
127 for av in av[1]:
128 skip = len(code); emit(0)
129 _compile(code, av, flags)
130 emit(OPCODES[JUMP])
131 tail.append(len(code)); emit(0)
132 code[skip] = len(code) - skip
133 emit(0) # end of branch
134 for tail in tail:
135 code[tail] = len(code) - tail
136 elif op is CATEGORY:
137 emit(OPCODES[op])
138 if flags & SRE_FLAG_LOCALE:
139 emit(CHCODES[CH_LOCALE[av]])
140 elif flags & SRE_FLAG_UNICODE:
141 emit(CHCODES[CH_UNICODE[av]])
142 else:
143 emit(CHCODES[av])
144 elif op is GROUP:
145 if flags & SRE_FLAG_IGNORECASE:
146 emit(OPCODES[OP_IGNORE[op]])
147 else:
148 emit(OPCODES[op])
149 emit(av-1)
150 elif op is MARK:
151 emit(OPCODES[op])
152 emit(av)
153 else:
154 raise ValueError, ("unsupported operand type", op)
155
156def _compile_info(code, pattern, flags):
157 # internal: compile an info block. in the current version,
158 # this contains min/max pattern width and a literal prefix,
159 # if any
160 lo, hi = pattern.getwidth()
161 if lo == 0:
162 return # not worth it
163 # look for a literal prefix
164 prefix = []
165 if not (flags & SRE_FLAG_IGNORECASE):
166 for op, av in pattern.data:
167 if op is LITERAL:
168 prefix.append(av)
169 else:
170 break
171 # add an info block
172 emit = code.append
173 emit(OPCODES[INFO])
174 skip = len(code); emit(0)
175 # literal flag
176 mask = 0
177 if len(prefix) == len(pattern.data):
178 mask = 1
179 emit(mask)
180 # pattern length
181 emit(lo)
182 if hi < 32768:
183 emit(hi)
184 else:
185 emit(0)
186 # add literal prefix
187 emit(len(prefix))
188 if prefix:
189 code.extend(prefix)
190 # generate overlap table
191 table = [-1] + ([0]*len(prefix))
192 for i in range(len(prefix)):
193 table[i+1] = table[i]+1
194 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
195 table[i+1] = table[table[i+1]-1]+1
196 code.extend(table[1:]) # don't store first entry
197 code[skip] = len(code) - skip
Guido van Rossumaad67612000-05-08 17:31:04 +0000198
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000199def compile(p, flags=0):
200 # internal: convert pattern list to internal format
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000201
202 # compile, as necessary
Guido van Rossumaad67612000-05-08 17:31:04 +0000203 if type(p) in (type(""), type(u"")):
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000204 import sre_parse
205 pattern = p
Guido van Rossumc08cb042000-07-01 04:23:47 +0000206 p = sre_parse.parse(p, flags)
Guido van Rossumaad67612000-05-08 17:31:04 +0000207 else:
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000208 pattern = None
209
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000210 flags = p.pattern.flags | flags
211 code = []
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000212
213 # compile info block
214 _compile_info(code, p, flags)
215
216 # compile the pattern
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000217 _compile(code, p.data, flags)
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000218
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000219 code.append(OPCODES[SUCCESS])
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000220
221 # FIXME: <fl> get rid of this limitation!
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000222 assert p.pattern.groups <= 100,\
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000223 "sorry, but this version only supports 100 named groups"
224
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000225 return _sre.compile(
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000226 pattern, flags,
227 array.array(WORDSIZE, code).tostring(),
228 p.pattern.groups-1, p.pattern.groupdict
229 )