blob: 97a57e2db19cdf2db71604976fd6d786cf5d2173 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Guido van Rossum7627c0d2000-03-31 14:58:54 +000011import _sre
12
13from sre_constants import *
14
Fredrik Lundh3562f112000-07-02 12:00:07 +000015MAXCODE = 65535
16
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000017def _compile(code, pattern, flags):
18 # internal: compile a (sub)pattern
19 emit = code.append
20 for op, av in pattern:
21 if op in (LITERAL, NOT_LITERAL):
22 if flags & SRE_FLAG_IGNORECASE:
23 emit(OPCODES[OP_IGNORE[op]])
24 else:
25 emit(OPCODES[op])
26 emit(av)
27 elif op is IN:
28 if flags & SRE_FLAG_IGNORECASE:
29 emit(OPCODES[OP_IGNORE[op]])
30 def fixup(literal, flags=flags):
31 return _sre.getlower(literal, flags)
32 else:
33 emit(OPCODES[op])
34 fixup = lambda x: x
35 skip = len(code); emit(0)
36 _compile_charset(av, flags, code, fixup)
37 code[skip] = len(code) - skip
38 elif op is ANY:
39 if flags & SRE_FLAG_DOTALL:
40 emit(OPCODES[ANY_ALL])
41 else:
42 emit(OPCODES[ANY])
43 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
44 if flags & SRE_FLAG_TEMPLATE:
45 raise error, "internal: unsupported template operator"
46 emit(OPCODES[REPEAT])
47 skip = len(code); emit(0)
48 emit(av[0])
49 emit(av[1])
50 _compile(code, av[2], flags)
51 emit(OPCODES[SUCCESS])
52 code[skip] = len(code) - skip
53 elif _simple(av) and op == MAX_REPEAT:
54 emit(OPCODES[REPEAT_ONE])
55 skip = len(code); emit(0)
56 emit(av[0])
57 emit(av[1])
58 _compile(code, av[2], flags)
59 emit(OPCODES[SUCCESS])
60 code[skip] = len(code) - skip
61 else:
62 emit(OPCODES[REPEAT])
63 skip = len(code); emit(0)
64 emit(av[0])
65 emit(av[1])
66 _compile(code, av[2], flags)
67 code[skip] = len(code) - skip
68 if op == MAX_REPEAT:
69 emit(OPCODES[MAX_UNTIL])
70 else:
71 emit(OPCODES[MIN_UNTIL])
72 elif op is SUBPATTERN:
73 if av[0]:
74 emit(OPCODES[MARK])
75 emit((av[0]-1)*2)
76 # _compile_info(code, av[1], flags)
77 _compile(code, av[1], flags)
78 if av[0]:
79 emit(OPCODES[MARK])
80 emit((av[0]-1)*2+1)
81 elif op in (SUCCESS, FAILURE):
82 emit(OPCODES[op])
83 elif op in (ASSERT, ASSERT_NOT):
84 emit(OPCODES[op])
85 skip = len(code); emit(0)
86 if av[0] >= 0:
87 emit(0) # look ahead
88 else:
89 lo, hi = av[1].getwidth()
90 if lo != hi:
91 raise error, "look-behind requires fixed-width pattern"
92 emit(lo) # look behind
93 _compile(code, av[1], flags)
94 emit(OPCODES[SUCCESS])
95 code[skip] = len(code) - skip
96 elif op is CALL:
97 emit(OPCODES[op])
98 skip = len(code); emit(0)
99 _compile(code, av, flags)
100 emit(OPCODES[SUCCESS])
101 code[skip] = len(code) - skip
102 elif op is AT:
103 emit(OPCODES[op])
104 if flags & SRE_FLAG_MULTILINE:
105 emit(ATCODES[AT_MULTILINE.get(av, av)])
106 else:
107 emit(ATCODES[av])
108 elif op is BRANCH:
109 emit(OPCODES[op])
110 tail = []
111 for av in av[1]:
112 skip = len(code); emit(0)
113 # _compile_info(code, av, flags)
114 _compile(code, av, flags)
115 emit(OPCODES[JUMP])
116 tail.append(len(code)); emit(0)
117 code[skip] = len(code) - skip
118 emit(0) # end of branch
119 for tail in tail:
120 code[tail] = len(code) - tail
121 elif op is CATEGORY:
122 emit(OPCODES[op])
123 if flags & SRE_FLAG_LOCALE:
124 emit(CHCODES[CH_LOCALE[av]])
125 elif flags & SRE_FLAG_UNICODE:
126 emit(CHCODES[CH_UNICODE[av]])
127 else:
128 emit(CHCODES[av])
129 elif op is GROUPREF:
130 if flags & SRE_FLAG_IGNORECASE:
131 emit(OPCODES[OP_IGNORE[op]])
132 else:
133 emit(OPCODES[op])
134 emit(av-1)
135 else:
136 raise ValueError, ("unsupported operand type", op)
137
138def _compile_charset(charset, flags, code, fixup=None):
139 # compile charset subprogram
140 emit = code.append
Fredrik Lundh28552902000-07-05 21:14:16 +0000141 if not fixup:
142 fixup = lambda x: x
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000143 for op, av in _optimize_charset(charset, fixup):
144 emit(OPCODES[op])
145 if op is NEGATE:
146 pass
147 elif op is LITERAL:
148 emit(fixup(av))
149 elif op is RANGE:
150 emit(fixup(av[0]))
151 emit(fixup(av[1]))
152 elif op is CHARSET:
153 code.extend(av)
154 elif op is CATEGORY:
155 if flags & SRE_FLAG_LOCALE:
156 emit(CHCODES[CH_LOCALE[av]])
157 elif flags & SRE_FLAG_UNICODE:
158 emit(CHCODES[CH_UNICODE[av]])
159 else:
160 emit(CHCODES[av])
161 else:
162 raise error, "internal: unsupported set operator"
163 emit(OPCODES[FAILURE])
164
165def _optimize_charset(charset, fixup):
166 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000167 out = []
168 charmap = [0]*256
169 try:
170 for op, av in charset:
171 if op is NEGATE:
172 out.append((op, av))
173 elif op is LITERAL:
174 charmap[fixup(av)] = 1
175 elif op is RANGE:
176 for i in range(fixup(av[0]), fixup(av[1])+1):
177 charmap[i] = 1
178 elif op is CATEGORY:
179 # FIXME: could append to charmap tail
180 return charset # cannot compress
181 except IndexError:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000182 # character set contains unicode characters
Fredrik Lundh3562f112000-07-02 12:00:07 +0000183 return charset
184 # compress character map
185 i = p = n = 0
186 runs = []
187 for c in charmap:
188 if c:
189 if n == 0:
190 p = i
191 n = n + 1
192 elif n:
193 runs.append((p, n))
194 n = 0
195 i = i + 1
196 if n:
197 runs.append((p, n))
198 if len(runs) <= 2:
199 # use literal/range
200 for p, n in runs:
201 if n == 1:
202 out.append((LITERAL, p))
203 else:
204 out.append((RANGE, (p, p+n-1)))
205 if len(out) < len(charset):
206 return out
207 else:
208 # use bitmap
209 data = []
210 m = 1; v = 0
211 for c in charmap:
212 if c:
213 v = v + m
214 m = m << 1
215 if m > MAXCODE:
216 data.append(v)
217 m = 1; v = 0
218 out.append((CHARSET, data))
219 return out
220 return charset
221
Fredrik Lundhe1869832000-08-01 22:47:49 +0000222def _simple(av):
223 # check if av is a "simple" operator
224 lo, hi = av[2].getwidth()
225 if lo == 0:
226 raise error, "nothing to repeat"
227 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
228
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000229def _compile_info(code, pattern, flags):
230 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000231 # this contains min/max pattern width, and an optional literal
232 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000233 lo, hi = pattern.getwidth()
234 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000235 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000236 # look for a literal prefix
237 prefix = []
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000238 prefix_skip = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000239 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000240 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000241 # look for literal prefix
Fredrik Lundh90a07912000-06-30 07:50:59 +0000242 for op, av in pattern.data:
243 if op is LITERAL:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000244 if len(prefix) == prefix_skip:
245 prefix_skip = prefix_skip + 1
Fredrik Lundh0640e112000-06-30 13:55:15 +0000246 prefix.append(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000247 elif op is SUBPATTERN and len(av[1]) == 1:
248 op, av = av[1][0]
249 if op is LITERAL:
250 prefix.append(av)
251 else:
252 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000253 else:
254 break
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000255 # if no prefix, look for charset prefix
256 if not prefix and pattern.data:
257 op, av = pattern.data[0]
258 if op is SUBPATTERN and av[1]:
259 op, av = av[1][0]
260 if op is LITERAL:
261 charset.append((op, av))
262 elif op is BRANCH:
263 c = []
264 for p in av[1]:
265 if not p:
266 break
267 op, av = p[0]
268 if op is LITERAL:
269 c.append((op, av))
270 else:
271 break
272 else:
273 charset = c
274 elif op is BRANCH:
275 c = []
276 for p in av[1]:
277 if not p:
278 break
279 op, av = p[0]
280 if op is LITERAL:
281 c.append((op, av))
282 else:
283 break
284 else:
285 charset = c
286 elif op is IN:
287 charset = av
288## if prefix:
289## print "*** PREFIX", prefix, prefix_skip
290## if charset:
291## print "*** CHARSET", charset
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000292 # add an info block
293 emit = code.append
294 emit(OPCODES[INFO])
295 skip = len(code); emit(0)
296 # literal flag
297 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000298 if prefix:
299 mask = SRE_INFO_PREFIX
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000300 if len(prefix) == prefix_skip == len(pattern.data):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000301 mask = mask + SRE_INFO_LITERAL
302 elif charset:
303 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000304 emit(mask)
305 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000306 if lo < MAXCODE:
307 emit(lo)
308 else:
309 emit(MAXCODE)
310 prefix = prefix[:MAXCODE]
311 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000312 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000313 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000314 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000315 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000316 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000317 emit(len(prefix)) # length
318 emit(prefix_skip) # skip
319 code.extend(prefix)
320 # generate overlap table
321 table = [-1] + ([0]*len(prefix))
322 for i in range(len(prefix)):
323 table[i+1] = table[i]+1
324 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
325 table[i+1] = table[table[i+1]-1]+1
326 code.extend(table[1:]) # don't store first entry
Fredrik Lundh3562f112000-07-02 12:00:07 +0000327 elif charset:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000328 _compile_charset(charset, 0, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000329 code[skip] = len(code) - skip
330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000331STRING_TYPES = [type("")]
332
333try:
334 STRING_TYPES.append(type(unicode("")))
335except NameError:
336 pass
337
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000338def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000339
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000340 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000341 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000342
343 # compile info block
344 _compile_info(code, p, flags)
345
346 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000347 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000348
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000349 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000350
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000351 return code
352
353def compile(p, flags=0):
354 # internal: convert pattern list to internal format
355
356 if type(p) in STRING_TYPES:
357 import sre_parse
358 pattern = p
359 p = sre_parse.parse(p, flags)
360 else:
361 pattern = None
362
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000363 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000364
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 # print code
366
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000367 # FIXME: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000368 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000369 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000370
Fredrik Lundhc2301732000-07-02 22:25:39 +0000371 # map in either direction
372 groupindex = p.pattern.groupdict
373 indexgroup = [None] * p.pattern.groups
374 for k, i in groupindex.items():
375 indexgroup[i] = k
376
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000377 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000378 pattern, flags, code,
379 p.pattern.groups-1,
380 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000381 )