blob: abd619e1e9bd7769f411fe3b286484818f2551cf [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Guido van Rossum7627c0d2000-03-31 14:58:54 +000011import _sre
12
13from sre_constants import *
14
Fredrik Lundh3562f112000-07-02 12:00:07 +000015MAXCODE = 65535
16
Fredrik Lundh28552902000-07-05 21:14:16 +000017def _charset(charset, fixup=None):
Fredrik Lundh3562f112000-07-02 12:00:07 +000018 # internal: optimize character set
Fredrik Lundh28552902000-07-05 21:14:16 +000019 if not fixup:
20 fixup = lambda x: x
Fredrik Lundh3562f112000-07-02 12:00:07 +000021 out = []
22 charmap = [0]*256
23 try:
24 for op, av in charset:
25 if op is NEGATE:
26 out.append((op, av))
27 elif op is LITERAL:
28 charmap[fixup(av)] = 1
29 elif op is RANGE:
30 for i in range(fixup(av[0]), fixup(av[1])+1):
31 charmap[i] = 1
32 elif op is CATEGORY:
33 # FIXME: could append to charmap tail
34 return charset # cannot compress
35 except IndexError:
36 # unicode
37 return charset
38 # compress character map
39 i = p = n = 0
40 runs = []
41 for c in charmap:
42 if c:
43 if n == 0:
44 p = i
45 n = n + 1
46 elif n:
47 runs.append((p, n))
48 n = 0
49 i = i + 1
50 if n:
51 runs.append((p, n))
52 if len(runs) <= 2:
53 # use literal/range
54 for p, n in runs:
55 if n == 1:
56 out.append((LITERAL, p))
57 else:
58 out.append((RANGE, (p, p+n-1)))
59 if len(out) < len(charset):
60 return out
61 else:
62 # use bitmap
63 data = []
64 m = 1; v = 0
65 for c in charmap:
66 if c:
67 v = v + m
68 m = m << 1
69 if m > MAXCODE:
70 data.append(v)
71 m = 1; v = 0
72 out.append((CHARSET, data))
73 return out
74 return charset
75
Fredrik Lundhe1869832000-08-01 22:47:49 +000076def _simple(av):
77 # check if av is a "simple" operator
78 lo, hi = av[2].getwidth()
79 if lo == 0:
80 raise error, "nothing to repeat"
81 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
82
Fredrik Lundh436c3d582000-06-29 08:58:44 +000083def _compile(code, pattern, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +000084 # internal: compile a (sub)pattern
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000085 emit = code.append
Guido van Rossum7627c0d2000-03-31 14:58:54 +000086 for op, av in pattern:
Fredrik Lundh43b3b492000-06-30 10:41:31 +000087 if op in (LITERAL, NOT_LITERAL):
Fredrik Lundh90a07912000-06-30 07:50:59 +000088 if flags & SRE_FLAG_IGNORECASE:
89 emit(OPCODES[OP_IGNORE[op]])
90 else:
91 emit(OPCODES[op])
Fredrik Lundh0640e112000-06-30 13:55:15 +000092 emit(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +000093 elif op is IN:
94 if flags & SRE_FLAG_IGNORECASE:
95 emit(OPCODES[OP_IGNORE[op]])
96 def fixup(literal, flags=flags):
Fredrik Lundh0640e112000-06-30 13:55:15 +000097 return _sre.getlower(literal, flags)
Fredrik Lundh90a07912000-06-30 07:50:59 +000098 else:
99 emit(OPCODES[op])
Fredrik Lundh4ccea942000-06-30 18:39:20 +0000100 fixup = lambda x: x
Fredrik Lundh90a07912000-06-30 07:50:59 +0000101 skip = len(code); emit(0)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000102 for op, av in _charset(av, fixup):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000103 emit(OPCODES[op])
104 if op is NEGATE:
105 pass
106 elif op is LITERAL:
107 emit(fixup(av))
108 elif op is RANGE:
109 emit(fixup(av[0]))
110 emit(fixup(av[1]))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000111 elif op is CHARSET:
112 code.extend(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000113 elif op is CATEGORY:
114 if flags & SRE_FLAG_LOCALE:
115 emit(CHCODES[CH_LOCALE[av]])
116 elif flags & SRE_FLAG_UNICODE:
117 emit(CHCODES[CH_UNICODE[av]])
118 else:
119 emit(CHCODES[av])
120 else:
121 raise error, "internal: unsupported set operator"
122 emit(OPCODES[FAILURE])
123 code[skip] = len(code) - skip
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000124 elif op is ANY:
125 if flags & SRE_FLAG_DOTALL:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000126 emit(OPCODES[ANY_ALL])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000127 else:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000128 emit(OPCODES[ANY])
Fredrik Lundh90a07912000-06-30 07:50:59 +0000129 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
130 if flags & SRE_FLAG_TEMPLATE:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000131 raise error, "internal: unsupported template operator"
Fredrik Lundh90a07912000-06-30 07:50:59 +0000132 emit(OPCODES[REPEAT])
133 skip = len(code); emit(0)
134 emit(av[0])
135 emit(av[1])
136 _compile(code, av[2], flags)
137 emit(OPCODES[SUCCESS])
138 code[skip] = len(code) - skip
Fredrik Lundhe1869832000-08-01 22:47:49 +0000139 elif _simple(av) and op == MAX_REPEAT:
140 emit(OPCODES[REPEAT_ONE])
141 skip = len(code); emit(0)
142 emit(av[0])
143 emit(av[1])
144 _compile(code, av[2], flags)
145 emit(OPCODES[SUCCESS])
146 code[skip] = len(code) - skip
Fredrik Lundh90a07912000-06-30 07:50:59 +0000147 else:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000148 emit(OPCODES[REPEAT])
149 skip = len(code); emit(0)
150 emit(av[0])
151 emit(av[1])
152 _compile(code, av[2], flags)
153 code[skip] = len(code) - skip
154 if op == MAX_REPEAT:
155 emit(OPCODES[MAX_UNTIL])
Fredrik Lundh90a07912000-06-30 07:50:59 +0000156 else:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000157 emit(OPCODES[MIN_UNTIL])
Fredrik Lundh90a07912000-06-30 07:50:59 +0000158 elif op is SUBPATTERN:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000159 if av[0]:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000160 emit(OPCODES[MARK])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000161 emit((av[0]-1)*2)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000162 _compile(code, av[1], flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000163 if av[0]:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000164 emit(OPCODES[MARK])
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000165 emit((av[0]-1)*2+1)
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000166 elif op in (SUCCESS, FAILURE):
167 emit(OPCODES[op])
Fredrik Lundh6f013982000-07-03 18:44:21 +0000168 elif op in (ASSERT, ASSERT_NOT):
169 emit(OPCODES[op])
170 skip = len(code); emit(0)
171 if av[0] >= 0:
172 emit(0) # look ahead
173 else:
174 lo, hi = av[1].getwidth()
175 if lo != hi:
176 raise error, "look-behind requires fixed-width pattern"
177 emit(lo) # look behind
178 _compile(code, av[1], flags)
179 emit(OPCODES[SUCCESS])
180 code[skip] = len(code) - skip
181 elif op is CALL:
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000182 emit(OPCODES[op])
183 skip = len(code); emit(0)
184 _compile(code, av, flags)
185 emit(OPCODES[SUCCESS])
186 code[skip] = len(code) - skip
187 elif op is AT:
188 emit(OPCODES[op])
189 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundh55a4f4a2000-06-30 22:37:31 +0000190 emit(ATCODES[AT_MULTILINE.get(av, av)])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000191 else:
192 emit(ATCODES[av])
193 elif op is BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000194 emit(OPCODES[op])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000195 tail = []
196 for av in av[1]:
197 skip = len(code); emit(0)
198 _compile(code, av, flags)
199 emit(OPCODES[JUMP])
200 tail.append(len(code)); emit(0)
201 code[skip] = len(code) - skip
202 emit(0) # end of branch
203 for tail in tail:
204 code[tail] = len(code) - tail
205 elif op is CATEGORY:
206 emit(OPCODES[op])
207 if flags & SRE_FLAG_LOCALE:
208 emit(CHCODES[CH_LOCALE[av]])
209 elif flags & SRE_FLAG_UNICODE:
210 emit(CHCODES[CH_UNICODE[av]])
211 else:
212 emit(CHCODES[av])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000213 elif op is GROUPREF:
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000214 if flags & SRE_FLAG_IGNORECASE:
215 emit(OPCODES[OP_IGNORE[op]])
216 else:
217 emit(OPCODES[op])
218 emit(av-1)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000219 else:
220 raise ValueError, ("unsupported operand type", op)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000221
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000222def _compile_info(code, pattern, flags):
223 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000224 # this contains min/max pattern width, and an optional literal
225 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000226 lo, hi = pattern.getwidth()
227 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000228 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000229 # look for a literal prefix
230 prefix = []
Fredrik Lundh3562f112000-07-02 12:00:07 +0000231 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000232 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000233 for op, av in pattern.data:
234 if op is LITERAL:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000235 prefix.append(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000236 else:
237 break
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000238 # add an info block
239 emit = code.append
240 emit(OPCODES[INFO])
241 skip = len(code); emit(0)
242 # literal flag
243 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000244 if prefix:
245 mask = SRE_INFO_PREFIX
246 if len(prefix) == len(pattern.data):
247 mask = mask + SRE_INFO_LITERAL
248 elif charset:
249 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000250 emit(mask)
251 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000252 if lo < MAXCODE:
253 emit(lo)
254 else:
255 emit(MAXCODE)
256 prefix = prefix[:MAXCODE]
257 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000258 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000259 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000260 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000261 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000262 if prefix:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000263 emit(len(prefix))
264 if prefix:
265 code.extend(prefix)
266 # generate overlap table
267 table = [-1] + ([0]*len(prefix))
268 for i in range(len(prefix)):
269 table[i+1] = table[i]+1
270 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
271 table[i+1] = table[table[i+1]-1]+1
272 code.extend(table[1:]) # don't store first entry
273 elif charset:
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000274 # FIXME: use charset optimizer!
Fredrik Lundh3562f112000-07-02 12:00:07 +0000275 for char in charset:
276 emit(OPCODES[LITERAL])
277 emit(char)
278 emit(OPCODES[FAILURE])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000279 code[skip] = len(code) - skip
280
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000281STRING_TYPES = [type("")]
282
283try:
284 STRING_TYPES.append(type(unicode("")))
285except NameError:
286 pass
287
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000288def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000289
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000290 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000291 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000292
293 # compile info block
294 _compile_info(code, p, flags)
295
296 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000297 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000298
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000299 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000300
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000301 return code
302
303def compile(p, flags=0):
304 # internal: convert pattern list to internal format
305
306 if type(p) in STRING_TYPES:
307 import sre_parse
308 pattern = p
309 p = sre_parse.parse(p, flags)
310 else:
311 pattern = None
312
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000313 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 # print code
316
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000317 # FIXME: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000318 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000319 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000320
Fredrik Lundhc2301732000-07-02 22:25:39 +0000321 # map in either direction
322 groupindex = p.pattern.groupdict
323 indexgroup = [None] * p.pattern.groups
324 for k, i in groupindex.items():
325 indexgroup[i] = k
326
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000327 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000328 pattern, flags, code,
329 p.pattern.groups-1,
330 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000331 )