blob: e5adb7e46a7817a57e2c4f43c46547d947cd755b [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Fred Drakeb8f22742001-09-04 19:10:20 +000011"""Internal support module for sre"""
12
Fredrik Lundh4fb70272002-06-27 20:08:25 +000013import _sre, sys
Guido van Rossum7627c0d2000-03-31 14:58:54 +000014
15from sre_constants import *
16
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000017assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
Fredrik Lundh3562f112000-07-02 12:00:07 +000019MAXCODE = 65535
20
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000021def _compile(code, pattern, flags):
22 # internal: compile a (sub)pattern
23 emit = code.append
24 for op, av in pattern:
25 if op in (LITERAL, NOT_LITERAL):
26 if flags & SRE_FLAG_IGNORECASE:
27 emit(OPCODES[OP_IGNORE[op]])
Fredrik Lundh2e240442001-01-15 18:28:14 +000028 emit(_sre.getlower(av, flags))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000029 else:
30 emit(OPCODES[op])
Fredrik Lundh2e240442001-01-15 18:28:14 +000031 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000032 elif op is IN:
33 if flags & SRE_FLAG_IGNORECASE:
34 emit(OPCODES[OP_IGNORE[op]])
35 def fixup(literal, flags=flags):
36 return _sre.getlower(literal, flags)
37 else:
38 emit(OPCODES[op])
39 fixup = lambda x: x
40 skip = len(code); emit(0)
41 _compile_charset(av, flags, code, fixup)
42 code[skip] = len(code) - skip
43 elif op is ANY:
44 if flags & SRE_FLAG_DOTALL:
45 emit(OPCODES[ANY_ALL])
46 else:
47 emit(OPCODES[ANY])
48 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
49 if flags & SRE_FLAG_TEMPLATE:
50 raise error, "internal: unsupported template operator"
51 emit(OPCODES[REPEAT])
52 skip = len(code); emit(0)
53 emit(av[0])
54 emit(av[1])
55 _compile(code, av[2], flags)
56 emit(OPCODES[SUCCESS])
57 code[skip] = len(code) - skip
58 elif _simple(av) and op == MAX_REPEAT:
59 emit(OPCODES[REPEAT_ONE])
60 skip = len(code); emit(0)
61 emit(av[0])
62 emit(av[1])
63 _compile(code, av[2], flags)
64 emit(OPCODES[SUCCESS])
65 code[skip] = len(code) - skip
66 else:
67 emit(OPCODES[REPEAT])
68 skip = len(code); emit(0)
69 emit(av[0])
70 emit(av[1])
71 _compile(code, av[2], flags)
72 code[skip] = len(code) - skip
73 if op == MAX_REPEAT:
74 emit(OPCODES[MAX_UNTIL])
75 else:
76 emit(OPCODES[MIN_UNTIL])
77 elif op is SUBPATTERN:
78 if av[0]:
79 emit(OPCODES[MARK])
80 emit((av[0]-1)*2)
81 # _compile_info(code, av[1], flags)
82 _compile(code, av[1], flags)
83 if av[0]:
84 emit(OPCODES[MARK])
85 emit((av[0]-1)*2+1)
86 elif op in (SUCCESS, FAILURE):
87 emit(OPCODES[op])
88 elif op in (ASSERT, ASSERT_NOT):
89 emit(OPCODES[op])
90 skip = len(code); emit(0)
91 if av[0] >= 0:
92 emit(0) # look ahead
93 else:
94 lo, hi = av[1].getwidth()
95 if lo != hi:
96 raise error, "look-behind requires fixed-width pattern"
97 emit(lo) # look behind
98 _compile(code, av[1], flags)
99 emit(OPCODES[SUCCESS])
100 code[skip] = len(code) - skip
101 elif op is CALL:
102 emit(OPCODES[op])
103 skip = len(code); emit(0)
104 _compile(code, av, flags)
105 emit(OPCODES[SUCCESS])
106 code[skip] = len(code) - skip
107 elif op is AT:
108 emit(OPCODES[op])
109 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000110 av = AT_MULTILINE.get(av, av)
111 if flags & SRE_FLAG_LOCALE:
112 av = AT_LOCALE.get(av, av)
113 elif flags & SRE_FLAG_UNICODE:
114 av = AT_UNICODE.get(av, av)
115 emit(ATCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000116 elif op is BRANCH:
117 emit(OPCODES[op])
118 tail = []
119 for av in av[1]:
120 skip = len(code); emit(0)
121 # _compile_info(code, av, flags)
122 _compile(code, av, flags)
123 emit(OPCODES[JUMP])
124 tail.append(len(code)); emit(0)
125 code[skip] = len(code) - skip
126 emit(0) # end of branch
127 for tail in tail:
128 code[tail] = len(code) - tail
129 elif op is CATEGORY:
130 emit(OPCODES[op])
131 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000132 av = CH_LOCALE[av]
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000133 elif flags & SRE_FLAG_UNICODE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000134 av = CH_UNICODE[av]
135 emit(CHCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000136 elif op is GROUPREF:
137 if flags & SRE_FLAG_IGNORECASE:
138 emit(OPCODES[OP_IGNORE[op]])
139 else:
140 emit(OPCODES[op])
141 emit(av-1)
142 else:
143 raise ValueError, ("unsupported operand type", op)
144
145def _compile_charset(charset, flags, code, fixup=None):
146 # compile charset subprogram
147 emit = code.append
Raymond Hettingerf13eb552002-06-02 00:40:05 +0000148 if fixup is None:
Fredrik Lundh28552902000-07-05 21:14:16 +0000149 fixup = lambda x: x
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000150 for op, av in _optimize_charset(charset, fixup):
151 emit(OPCODES[op])
152 if op is NEGATE:
153 pass
154 elif op is LITERAL:
155 emit(fixup(av))
156 elif op is RANGE:
157 emit(fixup(av[0]))
158 emit(fixup(av[1]))
159 elif op is CHARSET:
160 code.extend(av)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000161 elif op is BIGCHARSET:
162 code.extend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000163 elif op is CATEGORY:
164 if flags & SRE_FLAG_LOCALE:
165 emit(CHCODES[CH_LOCALE[av]])
166 elif flags & SRE_FLAG_UNICODE:
167 emit(CHCODES[CH_UNICODE[av]])
168 else:
169 emit(CHCODES[av])
170 else:
171 raise error, "internal: unsupported set operator"
172 emit(OPCODES[FAILURE])
173
174def _optimize_charset(charset, fixup):
175 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000176 out = []
177 charmap = [0]*256
178 try:
179 for op, av in charset:
180 if op is NEGATE:
181 out.append((op, av))
182 elif op is LITERAL:
183 charmap[fixup(av)] = 1
184 elif op is RANGE:
185 for i in range(fixup(av[0]), fixup(av[1])+1):
186 charmap[i] = 1
187 elif op is CATEGORY:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000188 # XXX: could append to charmap tail
Fredrik Lundh3562f112000-07-02 12:00:07 +0000189 return charset # cannot compress
190 except IndexError:
Martin v. Löwis67c4cb12002-09-26 16:39:20 +0000191 if sys.maxunicode != 65535:
192 # XXX: big charsets don't work in UCS-4 builds
193 return charset
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000194 # character set contains unicode characters
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000195 return _optimize_unicode(charset, fixup)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000196 # compress character map
197 i = p = n = 0
198 runs = []
199 for c in charmap:
200 if c:
201 if n == 0:
202 p = i
203 n = n + 1
204 elif n:
205 runs.append((p, n))
206 n = 0
207 i = i + 1
208 if n:
209 runs.append((p, n))
210 if len(runs) <= 2:
211 # use literal/range
212 for p, n in runs:
213 if n == 1:
214 out.append((LITERAL, p))
215 else:
216 out.append((RANGE, (p, p+n-1)))
217 if len(out) < len(charset):
218 return out
219 else:
220 # use bitmap
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000221 data = _mk_bitmap(charmap)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000222 out.append((CHARSET, data))
223 return out
224 return charset
225
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000226def _mk_bitmap(bits):
227 data = []
228 m = 1; v = 0
229 for c in bits:
230 if c:
231 v = v + m
232 m = m << 1
233 if m > MAXCODE:
234 data.append(v)
235 m = 1; v = 0
236 return data
237
238# To represent a big charset, first a bitmap of all characters in the
239# set is constructed. Then, this bitmap is sliced into chunks of 256
240# characters, duplicate chunks are eliminitated, and each chunk is
241# given a number. In the compiled expression, the charset is
242# represented by a 16-bit word sequence, consisting of one word for
243# the number of different chunks, a sequence of 256 bytes (128 words)
244# of chunk numbers indexed by their original chunk position, and a
245# sequence of chunks (16 words each).
246
247# Compression is normally good: in a typical charset, large ranges of
248# Unicode will be either completely excluded (e.g. if only cyrillic
249# letters are to be matched), or completely included (e.g. if large
250# subranges of Kanji match). These ranges will be represented by
251# chunks of all one-bits or all zero-bits.
252
253# Matching can be also done efficiently: the more significant byte of
254# the Unicode character is an index into the chunk number, and the
255# less significant byte is a bit index in the chunk (just like the
256# CHARSET matching).
257
258def _optimize_unicode(charset, fixup):
259 charmap = [0]*65536
260 negate = 0
261 for op, av in charset:
262 if op is NEGATE:
263 negate = 1
264 elif op is LITERAL:
265 charmap[fixup(av)] = 1
266 elif op is RANGE:
267 for i in range(fixup(av[0]), fixup(av[1])+1):
268 charmap[i] = 1
269 elif op is CATEGORY:
270 # XXX: could expand category
271 return charset # cannot compress
272 if negate:
273 for i in range(65536):
274 charmap[i] = not charmap[i]
275 comps = {}
276 mapping = [0]*256
277 block = 0
278 data = []
279 for i in range(256):
280 chunk = tuple(charmap[i*256:(i+1)*256])
281 new = comps.setdefault(chunk, block)
282 mapping[i] = new
283 if new == block:
Fredrik Lundh4fb70272002-06-27 20:08:25 +0000284 block = block + 1
285 data = data + _mk_bitmap(chunk)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000286 header = [block]
287 assert MAXCODE == 65535
288 for i in range(128):
Martin v. Löwis3550dd32001-07-19 14:26:10 +0000289 if sys.byteorder == 'big':
290 header.append(256*mapping[2*i]+mapping[2*i+1])
291 else:
292 header.append(mapping[2*i]+256*mapping[2*i+1])
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000293 data[0:0] = header
Tim Peters87cc0c32001-07-21 01:41:30 +0000294 return [(BIGCHARSET, data)]
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000295
Fredrik Lundhe1869832000-08-01 22:47:49 +0000296def _simple(av):
297 # check if av is a "simple" operator
298 lo, hi = av[2].getwidth()
Fredrik Lundh13ac9922000-10-07 17:38:23 +0000299 if lo == 0 and hi == MAXREPEAT:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000300 raise error, "nothing to repeat"
301 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
302
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000303def _compile_info(code, pattern, flags):
304 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000305 # this contains min/max pattern width, and an optional literal
306 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000307 lo, hi = pattern.getwidth()
308 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000309 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000310 # look for a literal prefix
311 prefix = []
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000312 prefix_skip = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000313 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000314 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000315 # look for literal prefix
Fredrik Lundh90a07912000-06-30 07:50:59 +0000316 for op, av in pattern.data:
317 if op is LITERAL:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000318 if len(prefix) == prefix_skip:
319 prefix_skip = prefix_skip + 1
Fredrik Lundh0640e112000-06-30 13:55:15 +0000320 prefix.append(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000321 elif op is SUBPATTERN and len(av[1]) == 1:
322 op, av = av[1][0]
323 if op is LITERAL:
324 prefix.append(av)
325 else:
326 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000327 else:
328 break
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000329 # if no prefix, look for charset prefix
330 if not prefix and pattern.data:
331 op, av = pattern.data[0]
332 if op is SUBPATTERN and av[1]:
333 op, av = av[1][0]
334 if op is LITERAL:
335 charset.append((op, av))
336 elif op is BRANCH:
337 c = []
338 for p in av[1]:
339 if not p:
340 break
341 op, av = p[0]
342 if op is LITERAL:
343 c.append((op, av))
344 else:
345 break
346 else:
347 charset = c
348 elif op is BRANCH:
349 c = []
350 for p in av[1]:
351 if not p:
352 break
353 op, av = p[0]
354 if op is LITERAL:
355 c.append((op, av))
356 else:
357 break
358 else:
359 charset = c
360 elif op is IN:
361 charset = av
362## if prefix:
363## print "*** PREFIX", prefix, prefix_skip
364## if charset:
365## print "*** CHARSET", charset
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000366 # add an info block
367 emit = code.append
368 emit(OPCODES[INFO])
369 skip = len(code); emit(0)
370 # literal flag
371 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000372 if prefix:
373 mask = SRE_INFO_PREFIX
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000374 if len(prefix) == prefix_skip == len(pattern.data):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000375 mask = mask + SRE_INFO_LITERAL
376 elif charset:
377 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000378 emit(mask)
379 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000380 if lo < MAXCODE:
381 emit(lo)
382 else:
383 emit(MAXCODE)
384 prefix = prefix[:MAXCODE]
385 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000386 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000387 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000388 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000389 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000390 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000391 emit(len(prefix)) # length
392 emit(prefix_skip) # skip
393 code.extend(prefix)
394 # generate overlap table
395 table = [-1] + ([0]*len(prefix))
396 for i in range(len(prefix)):
397 table[i+1] = table[i]+1
398 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
399 table[i+1] = table[table[i+1]-1]+1
400 code.extend(table[1:]) # don't store first entry
Fredrik Lundh3562f112000-07-02 12:00:07 +0000401 elif charset:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000402 _compile_charset(charset, 0, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000403 code[skip] = len(code) - skip
404
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000405STRING_TYPES = [type("")]
406
407try:
408 STRING_TYPES.append(type(unicode("")))
409except NameError:
410 pass
411
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000412def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000413
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000414 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000415 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000416
417 # compile info block
418 _compile_info(code, p, flags)
419
420 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000421 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000422
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000423 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000424
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000425 return code
426
427def compile(p, flags=0):
428 # internal: convert pattern list to internal format
429
430 if type(p) in STRING_TYPES:
431 import sre_parse
432 pattern = p
433 p = sre_parse.parse(p, flags)
434 else:
435 pattern = None
436
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000437 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000438
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000439 # print code
440
Fredrik Lundh770617b2001-01-14 15:06:11 +0000441 # XXX: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000442 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000443 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000444
Fredrik Lundhc2301732000-07-02 22:25:39 +0000445 # map in either direction
446 groupindex = p.pattern.groupdict
447 indexgroup = [None] * p.pattern.groups
448 for k, i in groupindex.items():
449 indexgroup[i] = k
450
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000451 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000452 pattern, flags, code,
453 p.pattern.groups-1,
454 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000455 )