blob: e48be72285b4277747cf2be59b149693d33f1177 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Fred Drakeb8f22742001-09-04 19:10:20 +000011"""Internal support module for sre"""
12
Martin v. Löwis3550dd32001-07-19 14:26:10 +000013import _sre,sys
Guido van Rossum7627c0d2000-03-31 14:58:54 +000014
15from sre_constants import *
16
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000017assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
Fredrik Lundh3562f112000-07-02 12:00:07 +000019MAXCODE = 65535
20
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000021def _compile(code, pattern, flags):
22 # internal: compile a (sub)pattern
23 emit = code.append
24 for op, av in pattern:
25 if op in (LITERAL, NOT_LITERAL):
26 if flags & SRE_FLAG_IGNORECASE:
27 emit(OPCODES[OP_IGNORE[op]])
Fredrik Lundh2e240442001-01-15 18:28:14 +000028 emit(_sre.getlower(av, flags))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000029 else:
30 emit(OPCODES[op])
Fredrik Lundh2e240442001-01-15 18:28:14 +000031 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000032 elif op is IN:
33 if flags & SRE_FLAG_IGNORECASE:
34 emit(OPCODES[OP_IGNORE[op]])
35 def fixup(literal, flags=flags):
36 return _sre.getlower(literal, flags)
37 else:
38 emit(OPCODES[op])
39 fixup = lambda x: x
40 skip = len(code); emit(0)
41 _compile_charset(av, flags, code, fixup)
42 code[skip] = len(code) - skip
43 elif op is ANY:
44 if flags & SRE_FLAG_DOTALL:
45 emit(OPCODES[ANY_ALL])
46 else:
47 emit(OPCODES[ANY])
48 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
49 if flags & SRE_FLAG_TEMPLATE:
50 raise error, "internal: unsupported template operator"
51 emit(OPCODES[REPEAT])
52 skip = len(code); emit(0)
53 emit(av[0])
54 emit(av[1])
55 _compile(code, av[2], flags)
56 emit(OPCODES[SUCCESS])
57 code[skip] = len(code) - skip
58 elif _simple(av) and op == MAX_REPEAT:
59 emit(OPCODES[REPEAT_ONE])
60 skip = len(code); emit(0)
61 emit(av[0])
62 emit(av[1])
63 _compile(code, av[2], flags)
64 emit(OPCODES[SUCCESS])
65 code[skip] = len(code) - skip
66 else:
67 emit(OPCODES[REPEAT])
68 skip = len(code); emit(0)
69 emit(av[0])
70 emit(av[1])
71 _compile(code, av[2], flags)
72 code[skip] = len(code) - skip
73 if op == MAX_REPEAT:
74 emit(OPCODES[MAX_UNTIL])
75 else:
76 emit(OPCODES[MIN_UNTIL])
77 elif op is SUBPATTERN:
78 if av[0]:
79 emit(OPCODES[MARK])
80 emit((av[0]-1)*2)
81 # _compile_info(code, av[1], flags)
82 _compile(code, av[1], flags)
83 if av[0]:
84 emit(OPCODES[MARK])
85 emit((av[0]-1)*2+1)
86 elif op in (SUCCESS, FAILURE):
87 emit(OPCODES[op])
88 elif op in (ASSERT, ASSERT_NOT):
89 emit(OPCODES[op])
90 skip = len(code); emit(0)
91 if av[0] >= 0:
92 emit(0) # look ahead
93 else:
94 lo, hi = av[1].getwidth()
95 if lo != hi:
96 raise error, "look-behind requires fixed-width pattern"
97 emit(lo) # look behind
98 _compile(code, av[1], flags)
99 emit(OPCODES[SUCCESS])
100 code[skip] = len(code) - skip
101 elif op is CALL:
102 emit(OPCODES[op])
103 skip = len(code); emit(0)
104 _compile(code, av, flags)
105 emit(OPCODES[SUCCESS])
106 code[skip] = len(code) - skip
107 elif op is AT:
108 emit(OPCODES[op])
109 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000110 av = AT_MULTILINE.get(av, av)
111 if flags & SRE_FLAG_LOCALE:
112 av = AT_LOCALE.get(av, av)
113 elif flags & SRE_FLAG_UNICODE:
114 av = AT_UNICODE.get(av, av)
115 emit(ATCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000116 elif op is BRANCH:
117 emit(OPCODES[op])
118 tail = []
119 for av in av[1]:
120 skip = len(code); emit(0)
121 # _compile_info(code, av, flags)
122 _compile(code, av, flags)
123 emit(OPCODES[JUMP])
124 tail.append(len(code)); emit(0)
125 code[skip] = len(code) - skip
126 emit(0) # end of branch
127 for tail in tail:
128 code[tail] = len(code) - tail
129 elif op is CATEGORY:
130 emit(OPCODES[op])
131 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000132 av = CH_LOCALE[av]
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000133 elif flags & SRE_FLAG_UNICODE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000134 av = CH_UNICODE[av]
135 emit(CHCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000136 elif op is GROUPREF:
137 if flags & SRE_FLAG_IGNORECASE:
138 emit(OPCODES[OP_IGNORE[op]])
139 else:
140 emit(OPCODES[op])
141 emit(av-1)
142 else:
143 raise ValueError, ("unsupported operand type", op)
144
145def _compile_charset(charset, flags, code, fixup=None):
146 # compile charset subprogram
147 emit = code.append
Fredrik Lundh28552902000-07-05 21:14:16 +0000148 if not fixup:
149 fixup = lambda x: x
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000150 for op, av in _optimize_charset(charset, fixup):
151 emit(OPCODES[op])
152 if op is NEGATE:
153 pass
154 elif op is LITERAL:
155 emit(fixup(av))
156 elif op is RANGE:
157 emit(fixup(av[0]))
158 emit(fixup(av[1]))
159 elif op is CHARSET:
160 code.extend(av)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000161 elif op is BIGCHARSET:
162 code.extend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000163 elif op is CATEGORY:
164 if flags & SRE_FLAG_LOCALE:
165 emit(CHCODES[CH_LOCALE[av]])
166 elif flags & SRE_FLAG_UNICODE:
167 emit(CHCODES[CH_UNICODE[av]])
168 else:
169 emit(CHCODES[av])
170 else:
171 raise error, "internal: unsupported set operator"
172 emit(OPCODES[FAILURE])
173
174def _optimize_charset(charset, fixup):
175 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000176 out = []
177 charmap = [0]*256
178 try:
179 for op, av in charset:
180 if op is NEGATE:
181 out.append((op, av))
182 elif op is LITERAL:
183 charmap[fixup(av)] = 1
184 elif op is RANGE:
185 for i in range(fixup(av[0]), fixup(av[1])+1):
186 charmap[i] = 1
187 elif op is CATEGORY:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000188 # XXX: could append to charmap tail
Fredrik Lundh3562f112000-07-02 12:00:07 +0000189 return charset # cannot compress
190 except IndexError:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000191 # character set contains unicode characters
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000192 return _optimize_unicode(charset, fixup)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000193 # compress character map
194 i = p = n = 0
195 runs = []
196 for c in charmap:
197 if c:
198 if n == 0:
199 p = i
200 n = n + 1
201 elif n:
202 runs.append((p, n))
203 n = 0
204 i = i + 1
205 if n:
206 runs.append((p, n))
207 if len(runs) <= 2:
208 # use literal/range
209 for p, n in runs:
210 if n == 1:
211 out.append((LITERAL, p))
212 else:
213 out.append((RANGE, (p, p+n-1)))
214 if len(out) < len(charset):
215 return out
216 else:
217 # use bitmap
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000218 data = _mk_bitmap(charmap)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000219 out.append((CHARSET, data))
220 return out
221 return charset
222
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000223def _mk_bitmap(bits):
224 data = []
225 m = 1; v = 0
226 for c in bits:
227 if c:
228 v = v + m
229 m = m << 1
230 if m > MAXCODE:
231 data.append(v)
232 m = 1; v = 0
233 return data
234
235# To represent a big charset, first a bitmap of all characters in the
236# set is constructed. Then, this bitmap is sliced into chunks of 256
237# characters, duplicate chunks are eliminitated, and each chunk is
238# given a number. In the compiled expression, the charset is
239# represented by a 16-bit word sequence, consisting of one word for
240# the number of different chunks, a sequence of 256 bytes (128 words)
241# of chunk numbers indexed by their original chunk position, and a
242# sequence of chunks (16 words each).
243
244# Compression is normally good: in a typical charset, large ranges of
245# Unicode will be either completely excluded (e.g. if only cyrillic
246# letters are to be matched), or completely included (e.g. if large
247# subranges of Kanji match). These ranges will be represented by
248# chunks of all one-bits or all zero-bits.
249
250# Matching can be also done efficiently: the more significant byte of
251# the Unicode character is an index into the chunk number, and the
252# less significant byte is a bit index in the chunk (just like the
253# CHARSET matching).
254
255def _optimize_unicode(charset, fixup):
256 charmap = [0]*65536
257 negate = 0
258 for op, av in charset:
259 if op is NEGATE:
260 negate = 1
261 elif op is LITERAL:
262 charmap[fixup(av)] = 1
263 elif op is RANGE:
264 for i in range(fixup(av[0]), fixup(av[1])+1):
265 charmap[i] = 1
266 elif op is CATEGORY:
267 # XXX: could expand category
268 return charset # cannot compress
269 if negate:
270 for i in range(65536):
271 charmap[i] = not charmap[i]
272 comps = {}
273 mapping = [0]*256
274 block = 0
275 data = []
276 for i in range(256):
277 chunk = tuple(charmap[i*256:(i+1)*256])
278 new = comps.setdefault(chunk, block)
279 mapping[i] = new
280 if new == block:
281 block += 1
282 data += _mk_bitmap(chunk)
283 header = [block]
284 assert MAXCODE == 65535
285 for i in range(128):
Martin v. Löwis3550dd32001-07-19 14:26:10 +0000286 if sys.byteorder == 'big':
287 header.append(256*mapping[2*i]+mapping[2*i+1])
288 else:
289 header.append(mapping[2*i]+256*mapping[2*i+1])
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000290 data[0:0] = header
Tim Peters87cc0c32001-07-21 01:41:30 +0000291 return [(BIGCHARSET, data)]
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000292
Fredrik Lundhe1869832000-08-01 22:47:49 +0000293def _simple(av):
294 # check if av is a "simple" operator
295 lo, hi = av[2].getwidth()
Fredrik Lundh13ac9922000-10-07 17:38:23 +0000296 if lo == 0 and hi == MAXREPEAT:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000297 raise error, "nothing to repeat"
298 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
299
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000300def _compile_info(code, pattern, flags):
301 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000302 # this contains min/max pattern width, and an optional literal
303 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000304 lo, hi = pattern.getwidth()
305 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000306 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000307 # look for a literal prefix
308 prefix = []
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000309 prefix_skip = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000310 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000311 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000312 # look for literal prefix
Fredrik Lundh90a07912000-06-30 07:50:59 +0000313 for op, av in pattern.data:
314 if op is LITERAL:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000315 if len(prefix) == prefix_skip:
316 prefix_skip = prefix_skip + 1
Fredrik Lundh0640e112000-06-30 13:55:15 +0000317 prefix.append(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000318 elif op is SUBPATTERN and len(av[1]) == 1:
319 op, av = av[1][0]
320 if op is LITERAL:
321 prefix.append(av)
322 else:
323 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000324 else:
325 break
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000326 # if no prefix, look for charset prefix
327 if not prefix and pattern.data:
328 op, av = pattern.data[0]
329 if op is SUBPATTERN and av[1]:
330 op, av = av[1][0]
331 if op is LITERAL:
332 charset.append((op, av))
333 elif op is BRANCH:
334 c = []
335 for p in av[1]:
336 if not p:
337 break
338 op, av = p[0]
339 if op is LITERAL:
340 c.append((op, av))
341 else:
342 break
343 else:
344 charset = c
345 elif op is BRANCH:
346 c = []
347 for p in av[1]:
348 if not p:
349 break
350 op, av = p[0]
351 if op is LITERAL:
352 c.append((op, av))
353 else:
354 break
355 else:
356 charset = c
357 elif op is IN:
358 charset = av
359## if prefix:
360## print "*** PREFIX", prefix, prefix_skip
361## if charset:
362## print "*** CHARSET", charset
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000363 # add an info block
364 emit = code.append
365 emit(OPCODES[INFO])
366 skip = len(code); emit(0)
367 # literal flag
368 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000369 if prefix:
370 mask = SRE_INFO_PREFIX
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000371 if len(prefix) == prefix_skip == len(pattern.data):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000372 mask = mask + SRE_INFO_LITERAL
373 elif charset:
374 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000375 emit(mask)
376 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000377 if lo < MAXCODE:
378 emit(lo)
379 else:
380 emit(MAXCODE)
381 prefix = prefix[:MAXCODE]
382 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000383 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000384 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000385 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000386 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000387 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000388 emit(len(prefix)) # length
389 emit(prefix_skip) # skip
390 code.extend(prefix)
391 # generate overlap table
392 table = [-1] + ([0]*len(prefix))
393 for i in range(len(prefix)):
394 table[i+1] = table[i]+1
395 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
396 table[i+1] = table[table[i+1]-1]+1
397 code.extend(table[1:]) # don't store first entry
Fredrik Lundh3562f112000-07-02 12:00:07 +0000398 elif charset:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000399 _compile_charset(charset, 0, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000400 code[skip] = len(code) - skip
401
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000402STRING_TYPES = [type("")]
403
404try:
405 STRING_TYPES.append(type(unicode("")))
406except NameError:
407 pass
408
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000409def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000410
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000411 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000412 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000413
414 # compile info block
415 _compile_info(code, p, flags)
416
417 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000418 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000419
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000420 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000421
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000422 return code
423
424def compile(p, flags=0):
425 # internal: convert pattern list to internal format
426
427 if type(p) in STRING_TYPES:
428 import sre_parse
429 pattern = p
430 p = sre_parse.parse(p, flags)
431 else:
432 pattern = None
433
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000434 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000435
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 # print code
437
Fredrik Lundh770617b2001-01-14 15:06:11 +0000438 # XXX: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000439 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000440 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000441
Fredrik Lundhc2301732000-07-02 22:25:39 +0000442 # map in either direction
443 groupindex = p.pattern.groupdict
444 indexgroup = [None] * p.pattern.groups
445 for k, i in groupindex.items():
446 indexgroup[i] = k
447
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000448 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000449 pattern, flags, code,
450 p.pattern.groups-1,
451 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000452 )