blob: 97a57e2db19cdf2db71604976fd6d786cf5d2173 [file] [log] [blame]
Guido van Rossumaad67612000-05-08 17:31:04 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossumaad67612000-05-08 17:31:04 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossum8d691c82000-09-01 19:25:51 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossumaad67612000-05-08 17:31:04 +00009#
10
Guido van Rossumaad67612000-05-08 17:31:04 +000011import _sre
12
13from sre_constants import *
14
Guido van Rossum8d691c82000-09-01 19:25:51 +000015MAXCODE = 65535
Guido van Rossumaad67612000-05-08 17:31:04 +000016
Guido van Rossumaad67612000-05-08 17:31:04 +000017def _compile(code, pattern, flags):
Guido van Rossum4358b2c2000-06-30 16:13:37 +000018 # internal: compile a (sub)pattern
Guido van Rossum3e06ab12000-06-29 19:35:29 +000019 emit = code.append
Guido van Rossumaad67612000-05-08 17:31:04 +000020 for op, av in pattern:
Guido van Rossum4358b2c2000-06-30 16:13:37 +000021 if op in (LITERAL, NOT_LITERAL):
22 if flags & SRE_FLAG_IGNORECASE:
23 emit(OPCODES[OP_IGNORE[op]])
24 else:
25 emit(OPCODES[op])
26 emit(av)
27 elif op is IN:
28 if flags & SRE_FLAG_IGNORECASE:
29 emit(OPCODES[OP_IGNORE[op]])
30 def fixup(literal, flags=flags):
31 return _sre.getlower(literal, flags)
32 else:
33 emit(OPCODES[op])
Guido van Rossumc08cb042000-07-01 04:23:47 +000034 fixup = lambda x: x
Guido van Rossum4358b2c2000-06-30 16:13:37 +000035 skip = len(code); emit(0)
Guido van Rossum8d691c82000-09-01 19:25:51 +000036 _compile_charset(av, flags, code, fixup)
Guido van Rossum4358b2c2000-06-30 16:13:37 +000037 code[skip] = len(code) - skip
38 elif op is ANY:
39 if flags & SRE_FLAG_DOTALL:
Guido van Rossum8d691c82000-09-01 19:25:51 +000040 emit(OPCODES[ANY_ALL])
Guido van Rossum4358b2c2000-06-30 16:13:37 +000041 else:
Guido van Rossum8d691c82000-09-01 19:25:51 +000042 emit(OPCODES[ANY])
Guido van Rossum4358b2c2000-06-30 16:13:37 +000043 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
44 if flags & SRE_FLAG_TEMPLATE:
Guido van Rossum8d691c82000-09-01 19:25:51 +000045 raise error, "internal: unsupported template operator"
Guido van Rossum4358b2c2000-06-30 16:13:37 +000046 emit(OPCODES[REPEAT])
47 skip = len(code); emit(0)
48 emit(av[0])
49 emit(av[1])
50 _compile(code, av[2], flags)
51 emit(OPCODES[SUCCESS])
52 code[skip] = len(code) - skip
Guido van Rossum8d691c82000-09-01 19:25:51 +000053 elif _simple(av) and op == MAX_REPEAT:
54 emit(OPCODES[REPEAT_ONE])
55 skip = len(code); emit(0)
56 emit(av[0])
57 emit(av[1])
58 _compile(code, av[2], flags)
59 emit(OPCODES[SUCCESS])
60 code[skip] = len(code) - skip
Guido van Rossum4358b2c2000-06-30 16:13:37 +000061 else:
Guido van Rossum8d691c82000-09-01 19:25:51 +000062 emit(OPCODES[REPEAT])
63 skip = len(code); emit(0)
64 emit(av[0])
65 emit(av[1])
66 _compile(code, av[2], flags)
67 code[skip] = len(code) - skip
68 if op == MAX_REPEAT:
69 emit(OPCODES[MAX_UNTIL])
Guido van Rossum4358b2c2000-06-30 16:13:37 +000070 else:
Guido van Rossum8d691c82000-09-01 19:25:51 +000071 emit(OPCODES[MIN_UNTIL])
Guido van Rossum4358b2c2000-06-30 16:13:37 +000072 elif op is SUBPATTERN:
Guido van Rossum8d691c82000-09-01 19:25:51 +000073 if av[0]:
Guido van Rossum4358b2c2000-06-30 16:13:37 +000074 emit(OPCODES[MARK])
Guido van Rossum8d691c82000-09-01 19:25:51 +000075 emit((av[0]-1)*2)
76 # _compile_info(code, av[1], flags)
Guido van Rossum4358b2c2000-06-30 16:13:37 +000077 _compile(code, av[1], flags)
Guido van Rossum8d691c82000-09-01 19:25:51 +000078 if av[0]:
Guido van Rossum4358b2c2000-06-30 16:13:37 +000079 emit(OPCODES[MARK])
Guido van Rossum8d691c82000-09-01 19:25:51 +000080 emit((av[0]-1)*2+1)
Guido van Rossum4358b2c2000-06-30 16:13:37 +000081 elif op in (SUCCESS, FAILURE):
82 emit(OPCODES[op])
Guido van Rossum8d691c82000-09-01 19:25:51 +000083 elif op in (ASSERT, ASSERT_NOT):
84 emit(OPCODES[op])
85 skip = len(code); emit(0)
86 if av[0] >= 0:
87 emit(0) # look ahead
88 else:
89 lo, hi = av[1].getwidth()
90 if lo != hi:
91 raise error, "look-behind requires fixed-width pattern"
92 emit(lo) # look behind
93 _compile(code, av[1], flags)
94 emit(OPCODES[SUCCESS])
95 code[skip] = len(code) - skip
96 elif op is CALL:
Guido van Rossum4358b2c2000-06-30 16:13:37 +000097 emit(OPCODES[op])
98 skip = len(code); emit(0)
99 _compile(code, av, flags)
100 emit(OPCODES[SUCCESS])
101 code[skip] = len(code) - skip
102 elif op is AT:
103 emit(OPCODES[op])
104 if flags & SRE_FLAG_MULTILINE:
Guido van Rossumc08cb042000-07-01 04:23:47 +0000105 emit(ATCODES[AT_MULTILINE.get(av, av)])
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000106 else:
107 emit(ATCODES[av])
108 elif op is BRANCH:
109 emit(OPCODES[op])
110 tail = []
111 for av in av[1]:
112 skip = len(code); emit(0)
Guido van Rossum8d691c82000-09-01 19:25:51 +0000113 # _compile_info(code, av, flags)
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000114 _compile(code, av, flags)
115 emit(OPCODES[JUMP])
116 tail.append(len(code)); emit(0)
117 code[skip] = len(code) - skip
118 emit(0) # end of branch
119 for tail in tail:
120 code[tail] = len(code) - tail
121 elif op is CATEGORY:
122 emit(OPCODES[op])
123 if flags & SRE_FLAG_LOCALE:
124 emit(CHCODES[CH_LOCALE[av]])
125 elif flags & SRE_FLAG_UNICODE:
126 emit(CHCODES[CH_UNICODE[av]])
127 else:
128 emit(CHCODES[av])
Guido van Rossum8d691c82000-09-01 19:25:51 +0000129 elif op is GROUPREF:
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000130 if flags & SRE_FLAG_IGNORECASE:
131 emit(OPCODES[OP_IGNORE[op]])
132 else:
133 emit(OPCODES[op])
134 emit(av-1)
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000135 else:
136 raise ValueError, ("unsupported operand type", op)
137
Guido van Rossum8d691c82000-09-01 19:25:51 +0000138def _compile_charset(charset, flags, code, fixup=None):
139 # compile charset subprogram
140 emit = code.append
141 if not fixup:
142 fixup = lambda x: x
143 for op, av in _optimize_charset(charset, fixup):
144 emit(OPCODES[op])
145 if op is NEGATE:
146 pass
147 elif op is LITERAL:
148 emit(fixup(av))
149 elif op is RANGE:
150 emit(fixup(av[0]))
151 emit(fixup(av[1]))
152 elif op is CHARSET:
153 code.extend(av)
154 elif op is CATEGORY:
155 if flags & SRE_FLAG_LOCALE:
156 emit(CHCODES[CH_LOCALE[av]])
157 elif flags & SRE_FLAG_UNICODE:
158 emit(CHCODES[CH_UNICODE[av]])
159 else:
160 emit(CHCODES[av])
161 else:
162 raise error, "internal: unsupported set operator"
163 emit(OPCODES[FAILURE])
164
165def _optimize_charset(charset, fixup):
166 # internal: optimize character set
167 out = []
168 charmap = [0]*256
169 try:
170 for op, av in charset:
171 if op is NEGATE:
172 out.append((op, av))
173 elif op is LITERAL:
174 charmap[fixup(av)] = 1
175 elif op is RANGE:
176 for i in range(fixup(av[0]), fixup(av[1])+1):
177 charmap[i] = 1
178 elif op is CATEGORY:
179 # FIXME: could append to charmap tail
180 return charset # cannot compress
181 except IndexError:
182 # character set contains unicode characters
183 return charset
184 # compress character map
185 i = p = n = 0
186 runs = []
187 for c in charmap:
188 if c:
189 if n == 0:
190 p = i
191 n = n + 1
192 elif n:
193 runs.append((p, n))
194 n = 0
195 i = i + 1
196 if n:
197 runs.append((p, n))
198 if len(runs) <= 2:
199 # use literal/range
200 for p, n in runs:
201 if n == 1:
202 out.append((LITERAL, p))
203 else:
204 out.append((RANGE, (p, p+n-1)))
205 if len(out) < len(charset):
206 return out
207 else:
208 # use bitmap
209 data = []
210 m = 1; v = 0
211 for c in charmap:
212 if c:
213 v = v + m
214 m = m << 1
215 if m > MAXCODE:
216 data.append(v)
217 m = 1; v = 0
218 out.append((CHARSET, data))
219 return out
220 return charset
221
222def _simple(av):
223 # check if av is a "simple" operator
224 lo, hi = av[2].getwidth()
225 if lo == 0:
226 raise error, "nothing to repeat"
227 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
228
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000229def _compile_info(code, pattern, flags):
230 # internal: compile an info block. in the current version,
Guido van Rossum8d691c82000-09-01 19:25:51 +0000231 # this contains min/max pattern width, and an optional literal
232 # prefix or a character map
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000233 lo, hi = pattern.getwidth()
234 if lo == 0:
235 return # not worth it
236 # look for a literal prefix
237 prefix = []
Guido van Rossum8d691c82000-09-01 19:25:51 +0000238 prefix_skip = 0
239 charset = [] # not used
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000240 if not (flags & SRE_FLAG_IGNORECASE):
Guido van Rossum8d691c82000-09-01 19:25:51 +0000241 # look for literal prefix
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000242 for op, av in pattern.data:
243 if op is LITERAL:
Guido van Rossum8d691c82000-09-01 19:25:51 +0000244 if len(prefix) == prefix_skip:
245 prefix_skip = prefix_skip + 1
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000246 prefix.append(av)
Guido van Rossum8d691c82000-09-01 19:25:51 +0000247 elif op is SUBPATTERN and len(av[1]) == 1:
248 op, av = av[1][0]
249 if op is LITERAL:
250 prefix.append(av)
251 else:
252 break
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000253 else:
254 break
Guido van Rossum8d691c82000-09-01 19:25:51 +0000255 # if no prefix, look for charset prefix
256 if not prefix and pattern.data:
257 op, av = pattern.data[0]
258 if op is SUBPATTERN and av[1]:
259 op, av = av[1][0]
260 if op is LITERAL:
261 charset.append((op, av))
262 elif op is BRANCH:
263 c = []
264 for p in av[1]:
265 if not p:
266 break
267 op, av = p[0]
268 if op is LITERAL:
269 c.append((op, av))
270 else:
271 break
272 else:
273 charset = c
274 elif op is BRANCH:
275 c = []
276 for p in av[1]:
277 if not p:
278 break
279 op, av = p[0]
280 if op is LITERAL:
281 c.append((op, av))
282 else:
283 break
284 else:
285 charset = c
286 elif op is IN:
287 charset = av
288## if prefix:
289## print "*** PREFIX", prefix, prefix_skip
290## if charset:
291## print "*** CHARSET", charset
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000292 # add an info block
293 emit = code.append
294 emit(OPCODES[INFO])
295 skip = len(code); emit(0)
296 # literal flag
297 mask = 0
Guido van Rossum8d691c82000-09-01 19:25:51 +0000298 if prefix:
299 mask = SRE_INFO_PREFIX
300 if len(prefix) == prefix_skip == len(pattern.data):
301 mask = mask + SRE_INFO_LITERAL
302 elif charset:
303 mask = mask + SRE_INFO_CHARSET
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000304 emit(mask)
305 # pattern length
Guido van Rossum8d691c82000-09-01 19:25:51 +0000306 if lo < MAXCODE:
307 emit(lo)
308 else:
309 emit(MAXCODE)
310 prefix = prefix[:MAXCODE]
311 if hi < MAXCODE:
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000312 emit(hi)
313 else:
314 emit(0)
315 # add literal prefix
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000316 if prefix:
Guido van Rossum8d691c82000-09-01 19:25:51 +0000317 emit(len(prefix)) # length
318 emit(prefix_skip) # skip
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000319 code.extend(prefix)
320 # generate overlap table
321 table = [-1] + ([0]*len(prefix))
322 for i in range(len(prefix)):
323 table[i+1] = table[i]+1
324 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
325 table[i+1] = table[table[i+1]-1]+1
326 code.extend(table[1:]) # don't store first entry
Guido van Rossum8d691c82000-09-01 19:25:51 +0000327 elif charset:
328 _compile_charset(charset, 0, code)
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000329 code[skip] = len(code) - skip
Guido van Rossumaad67612000-05-08 17:31:04 +0000330
Guido van Rossum8d691c82000-09-01 19:25:51 +0000331STRING_TYPES = [type("")]
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000332
Guido van Rossum8d691c82000-09-01 19:25:51 +0000333try:
334 STRING_TYPES.append(type(unicode("")))
335except NameError:
336 pass
337
338def _code(p, flags):
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000339
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000340 flags = p.pattern.flags | flags
341 code = []
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000342
343 # compile info block
344 _compile_info(code, p, flags)
345
346 # compile the pattern
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000347 _compile(code, p.data, flags)
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000348
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000349 code.append(OPCODES[SUCCESS])
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000350
Guido van Rossum8d691c82000-09-01 19:25:51 +0000351 return code
352
353def compile(p, flags=0):
354 # internal: convert pattern list to internal format
355
356 if type(p) in STRING_TYPES:
357 import sre_parse
358 pattern = p
359 p = sre_parse.parse(p, flags)
360 else:
361 pattern = None
362
363 code = _code(p, flags)
364
365 # print code
366
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000367 # FIXME: <fl> get rid of this limitation!
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000368 assert p.pattern.groups <= 100,\
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000369 "sorry, but this version only supports 100 named groups"
370
Guido van Rossum8d691c82000-09-01 19:25:51 +0000371 # map in either direction
372 groupindex = p.pattern.groupdict
373 indexgroup = [None] * p.pattern.groups
374 for k, i in groupindex.items():
375 indexgroup[i] = k
376
Guido van Rossum3e06ab12000-06-29 19:35:29 +0000377 return _sre.compile(
Guido van Rossum8d691c82000-09-01 19:25:51 +0000378 pattern, flags, code,
379 p.pattern.groups-1,
380 groupindex, indexgroup
Guido van Rossum4358b2c2000-06-30 16:13:37 +0000381 )