blob: 44cb23e6a4a78c26621c7a3df356f36e0729b7bc [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Guido van Rossum7627c0d2000-03-31 14:58:54 +000011import _sre
12
13from sre_constants import *
14
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000015assert _sre.MAGIC == MAGIC, "SRE module mismatch"
16
Fredrik Lundh3562f112000-07-02 12:00:07 +000017MAXCODE = 65535
18
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000019def _compile(code, pattern, flags):
20 # internal: compile a (sub)pattern
21 emit = code.append
22 for op, av in pattern:
23 if op in (LITERAL, NOT_LITERAL):
24 if flags & SRE_FLAG_IGNORECASE:
25 emit(OPCODES[OP_IGNORE[op]])
Fredrik Lundh2e240442001-01-15 18:28:14 +000026 emit(_sre.getlower(av, flags))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000027 else:
28 emit(OPCODES[op])
Fredrik Lundh2e240442001-01-15 18:28:14 +000029 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000030 elif op is IN:
31 if flags & SRE_FLAG_IGNORECASE:
32 emit(OPCODES[OP_IGNORE[op]])
33 def fixup(literal, flags=flags):
34 return _sre.getlower(literal, flags)
35 else:
36 emit(OPCODES[op])
37 fixup = lambda x: x
38 skip = len(code); emit(0)
39 _compile_charset(av, flags, code, fixup)
40 code[skip] = len(code) - skip
41 elif op is ANY:
42 if flags & SRE_FLAG_DOTALL:
43 emit(OPCODES[ANY_ALL])
44 else:
45 emit(OPCODES[ANY])
46 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
47 if flags & SRE_FLAG_TEMPLATE:
48 raise error, "internal: unsupported template operator"
49 emit(OPCODES[REPEAT])
50 skip = len(code); emit(0)
51 emit(av[0])
52 emit(av[1])
53 _compile(code, av[2], flags)
54 emit(OPCODES[SUCCESS])
55 code[skip] = len(code) - skip
56 elif _simple(av) and op == MAX_REPEAT:
57 emit(OPCODES[REPEAT_ONE])
58 skip = len(code); emit(0)
59 emit(av[0])
60 emit(av[1])
61 _compile(code, av[2], flags)
62 emit(OPCODES[SUCCESS])
63 code[skip] = len(code) - skip
64 else:
65 emit(OPCODES[REPEAT])
66 skip = len(code); emit(0)
67 emit(av[0])
68 emit(av[1])
69 _compile(code, av[2], flags)
70 code[skip] = len(code) - skip
71 if op == MAX_REPEAT:
72 emit(OPCODES[MAX_UNTIL])
73 else:
74 emit(OPCODES[MIN_UNTIL])
75 elif op is SUBPATTERN:
76 if av[0]:
77 emit(OPCODES[MARK])
78 emit((av[0]-1)*2)
79 # _compile_info(code, av[1], flags)
80 _compile(code, av[1], flags)
81 if av[0]:
82 emit(OPCODES[MARK])
83 emit((av[0]-1)*2+1)
84 elif op in (SUCCESS, FAILURE):
85 emit(OPCODES[op])
86 elif op in (ASSERT, ASSERT_NOT):
87 emit(OPCODES[op])
88 skip = len(code); emit(0)
89 if av[0] >= 0:
90 emit(0) # look ahead
91 else:
92 lo, hi = av[1].getwidth()
93 if lo != hi:
94 raise error, "look-behind requires fixed-width pattern"
95 emit(lo) # look behind
96 _compile(code, av[1], flags)
97 emit(OPCODES[SUCCESS])
98 code[skip] = len(code) - skip
99 elif op is CALL:
100 emit(OPCODES[op])
101 skip = len(code); emit(0)
102 _compile(code, av, flags)
103 emit(OPCODES[SUCCESS])
104 code[skip] = len(code) - skip
105 elif op is AT:
106 emit(OPCODES[op])
107 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000108 av = AT_MULTILINE.get(av, av)
109 if flags & SRE_FLAG_LOCALE:
110 av = AT_LOCALE.get(av, av)
111 elif flags & SRE_FLAG_UNICODE:
112 av = AT_UNICODE.get(av, av)
113 emit(ATCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000114 elif op is BRANCH:
115 emit(OPCODES[op])
116 tail = []
117 for av in av[1]:
118 skip = len(code); emit(0)
119 # _compile_info(code, av, flags)
120 _compile(code, av, flags)
121 emit(OPCODES[JUMP])
122 tail.append(len(code)); emit(0)
123 code[skip] = len(code) - skip
124 emit(0) # end of branch
125 for tail in tail:
126 code[tail] = len(code) - tail
127 elif op is CATEGORY:
128 emit(OPCODES[op])
129 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000130 av = CH_LOCALE[av]
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000131 elif flags & SRE_FLAG_UNICODE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000132 av = CH_UNICODE[av]
133 emit(CHCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000134 elif op is GROUPREF:
135 if flags & SRE_FLAG_IGNORECASE:
136 emit(OPCODES[OP_IGNORE[op]])
137 else:
138 emit(OPCODES[op])
139 emit(av-1)
140 else:
141 raise ValueError, ("unsupported operand type", op)
142
143def _compile_charset(charset, flags, code, fixup=None):
144 # compile charset subprogram
145 emit = code.append
Fredrik Lundh28552902000-07-05 21:14:16 +0000146 if not fixup:
147 fixup = lambda x: x
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000148 for op, av in _optimize_charset(charset, fixup):
149 emit(OPCODES[op])
150 if op is NEGATE:
151 pass
152 elif op is LITERAL:
153 emit(fixup(av))
154 elif op is RANGE:
155 emit(fixup(av[0]))
156 emit(fixup(av[1]))
157 elif op is CHARSET:
158 code.extend(av)
159 elif op is CATEGORY:
160 if flags & SRE_FLAG_LOCALE:
161 emit(CHCODES[CH_LOCALE[av]])
162 elif flags & SRE_FLAG_UNICODE:
163 emit(CHCODES[CH_UNICODE[av]])
164 else:
165 emit(CHCODES[av])
166 else:
167 raise error, "internal: unsupported set operator"
168 emit(OPCODES[FAILURE])
169
170def _optimize_charset(charset, fixup):
171 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000172 out = []
173 charmap = [0]*256
174 try:
175 for op, av in charset:
176 if op is NEGATE:
177 out.append((op, av))
178 elif op is LITERAL:
179 charmap[fixup(av)] = 1
180 elif op is RANGE:
181 for i in range(fixup(av[0]), fixup(av[1])+1):
182 charmap[i] = 1
183 elif op is CATEGORY:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000184 # XXX: could append to charmap tail
Fredrik Lundh3562f112000-07-02 12:00:07 +0000185 return charset # cannot compress
186 except IndexError:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000187 # character set contains unicode characters
Fredrik Lundh3562f112000-07-02 12:00:07 +0000188 return charset
189 # compress character map
190 i = p = n = 0
191 runs = []
192 for c in charmap:
193 if c:
194 if n == 0:
195 p = i
196 n = n + 1
197 elif n:
198 runs.append((p, n))
199 n = 0
200 i = i + 1
201 if n:
202 runs.append((p, n))
203 if len(runs) <= 2:
204 # use literal/range
205 for p, n in runs:
206 if n == 1:
207 out.append((LITERAL, p))
208 else:
209 out.append((RANGE, (p, p+n-1)))
210 if len(out) < len(charset):
211 return out
212 else:
213 # use bitmap
214 data = []
215 m = 1; v = 0
216 for c in charmap:
217 if c:
218 v = v + m
219 m = m << 1
220 if m > MAXCODE:
221 data.append(v)
222 m = 1; v = 0
223 out.append((CHARSET, data))
224 return out
225 return charset
226
Fredrik Lundhe1869832000-08-01 22:47:49 +0000227def _simple(av):
228 # check if av is a "simple" operator
229 lo, hi = av[2].getwidth()
Fredrik Lundh13ac9922000-10-07 17:38:23 +0000230 if lo == 0 and hi == MAXREPEAT:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000231 raise error, "nothing to repeat"
232 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
233
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000234def _compile_info(code, pattern, flags):
235 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000236 # this contains min/max pattern width, and an optional literal
237 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000238 lo, hi = pattern.getwidth()
239 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000240 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000241 # look for a literal prefix
242 prefix = []
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000243 prefix_skip = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000244 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000245 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000246 # look for literal prefix
Fredrik Lundh90a07912000-06-30 07:50:59 +0000247 for op, av in pattern.data:
248 if op is LITERAL:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000249 if len(prefix) == prefix_skip:
250 prefix_skip = prefix_skip + 1
Fredrik Lundh0640e112000-06-30 13:55:15 +0000251 prefix.append(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000252 elif op is SUBPATTERN and len(av[1]) == 1:
253 op, av = av[1][0]
254 if op is LITERAL:
255 prefix.append(av)
256 else:
257 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000258 else:
259 break
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000260 # if no prefix, look for charset prefix
261 if not prefix and pattern.data:
262 op, av = pattern.data[0]
263 if op is SUBPATTERN and av[1]:
264 op, av = av[1][0]
265 if op is LITERAL:
266 charset.append((op, av))
267 elif op is BRANCH:
268 c = []
269 for p in av[1]:
270 if not p:
271 break
272 op, av = p[0]
273 if op is LITERAL:
274 c.append((op, av))
275 else:
276 break
277 else:
278 charset = c
279 elif op is BRANCH:
280 c = []
281 for p in av[1]:
282 if not p:
283 break
284 op, av = p[0]
285 if op is LITERAL:
286 c.append((op, av))
287 else:
288 break
289 else:
290 charset = c
291 elif op is IN:
292 charset = av
293## if prefix:
294## print "*** PREFIX", prefix, prefix_skip
295## if charset:
296## print "*** CHARSET", charset
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000297 # add an info block
298 emit = code.append
299 emit(OPCODES[INFO])
300 skip = len(code); emit(0)
301 # literal flag
302 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000303 if prefix:
304 mask = SRE_INFO_PREFIX
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000305 if len(prefix) == prefix_skip == len(pattern.data):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000306 mask = mask + SRE_INFO_LITERAL
307 elif charset:
308 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000309 emit(mask)
310 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000311 if lo < MAXCODE:
312 emit(lo)
313 else:
314 emit(MAXCODE)
315 prefix = prefix[:MAXCODE]
316 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000317 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000318 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000319 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000320 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000321 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000322 emit(len(prefix)) # length
323 emit(prefix_skip) # skip
324 code.extend(prefix)
325 # generate overlap table
326 table = [-1] + ([0]*len(prefix))
327 for i in range(len(prefix)):
328 table[i+1] = table[i]+1
329 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
330 table[i+1] = table[table[i+1]-1]+1
331 code.extend(table[1:]) # don't store first entry
Fredrik Lundh3562f112000-07-02 12:00:07 +0000332 elif charset:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000333 _compile_charset(charset, 0, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000334 code[skip] = len(code) - skip
335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336STRING_TYPES = [type("")]
337
338try:
339 STRING_TYPES.append(type(unicode("")))
340except NameError:
341 pass
342
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000343def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000344
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000345 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000346 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000347
348 # compile info block
349 _compile_info(code, p, flags)
350
351 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000352 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000353
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000354 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000355
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000356 return code
357
358def compile(p, flags=0):
359 # internal: convert pattern list to internal format
360
361 if type(p) in STRING_TYPES:
362 import sre_parse
363 pattern = p
364 p = sre_parse.parse(p, flags)
365 else:
366 pattern = None
367
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000368 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000370 # print code
371
Fredrik Lundh770617b2001-01-14 15:06:11 +0000372 # XXX: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000373 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000374 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000375
Fredrik Lundhc2301732000-07-02 22:25:39 +0000376 # map in either direction
377 groupindex = p.pattern.groupdict
378 indexgroup = [None] * p.pattern.groups
379 for k, i in groupindex.items():
380 indexgroup[i] = k
381
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000382 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000383 pattern, flags, code,
384 p.pattern.groups-1,
385 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000386 )