blob: ef26e1cab113630a1a830a19ffc6d020a421da15 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossum7627c0d2000-03-31 14:58:54 +00008# Portions of this engine have been developed in cooperation with
Fredrik Lundh22d25462000-07-01 17:50:59 +00009# CNRI. Hewlett-Packard provided funding for 2.0 integration and
Guido van Rossum7627c0d2000-03-31 14:58:54 +000010# other compatibility work.
11#
12
Guido van Rossum7627c0d2000-03-31 14:58:54 +000013import _sre
14
15from sre_constants import *
16
Fredrik Lundh3562f112000-07-02 12:00:07 +000017MAXCODE = 65535
18
Fredrik Lundh28552902000-07-05 21:14:16 +000019def _charset(charset, fixup=None):
Fredrik Lundh3562f112000-07-02 12:00:07 +000020 # internal: optimize character set
Fredrik Lundh28552902000-07-05 21:14:16 +000021 if not fixup:
22 fixup = lambda x: x
Fredrik Lundh3562f112000-07-02 12:00:07 +000023 out = []
24 charmap = [0]*256
25 try:
26 for op, av in charset:
27 if op is NEGATE:
28 out.append((op, av))
29 elif op is LITERAL:
30 charmap[fixup(av)] = 1
31 elif op is RANGE:
32 for i in range(fixup(av[0]), fixup(av[1])+1):
33 charmap[i] = 1
34 elif op is CATEGORY:
35 # FIXME: could append to charmap tail
36 return charset # cannot compress
37 except IndexError:
38 # unicode
39 return charset
40 # compress character map
41 i = p = n = 0
42 runs = []
43 for c in charmap:
44 if c:
45 if n == 0:
46 p = i
47 n = n + 1
48 elif n:
49 runs.append((p, n))
50 n = 0
51 i = i + 1
52 if n:
53 runs.append((p, n))
54 if len(runs) <= 2:
55 # use literal/range
56 for p, n in runs:
57 if n == 1:
58 out.append((LITERAL, p))
59 else:
60 out.append((RANGE, (p, p+n-1)))
61 if len(out) < len(charset):
62 return out
63 else:
64 # use bitmap
65 data = []
66 m = 1; v = 0
67 for c in charmap:
68 if c:
69 v = v + m
70 m = m << 1
71 if m > MAXCODE:
72 data.append(v)
73 m = 1; v = 0
74 out.append((CHARSET, data))
75 return out
76 return charset
77
Fredrik Lundh436c3d582000-06-29 08:58:44 +000078def _compile(code, pattern, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +000079 # internal: compile a (sub)pattern
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000080 emit = code.append
Guido van Rossum7627c0d2000-03-31 14:58:54 +000081 for op, av in pattern:
Fredrik Lundh43b3b492000-06-30 10:41:31 +000082 if op in (LITERAL, NOT_LITERAL):
Fredrik Lundh90a07912000-06-30 07:50:59 +000083 if flags & SRE_FLAG_IGNORECASE:
84 emit(OPCODES[OP_IGNORE[op]])
85 else:
86 emit(OPCODES[op])
Fredrik Lundh0640e112000-06-30 13:55:15 +000087 emit(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +000088 elif op is IN:
89 if flags & SRE_FLAG_IGNORECASE:
90 emit(OPCODES[OP_IGNORE[op]])
91 def fixup(literal, flags=flags):
Fredrik Lundh0640e112000-06-30 13:55:15 +000092 return _sre.getlower(literal, flags)
Fredrik Lundh90a07912000-06-30 07:50:59 +000093 else:
94 emit(OPCODES[op])
Fredrik Lundh4ccea942000-06-30 18:39:20 +000095 fixup = lambda x: x
Fredrik Lundh90a07912000-06-30 07:50:59 +000096 skip = len(code); emit(0)
Fredrik Lundh3562f112000-07-02 12:00:07 +000097 for op, av in _charset(av, fixup):
Fredrik Lundh90a07912000-06-30 07:50:59 +000098 emit(OPCODES[op])
99 if op is NEGATE:
100 pass
101 elif op is LITERAL:
102 emit(fixup(av))
103 elif op is RANGE:
104 emit(fixup(av[0]))
105 emit(fixup(av[1]))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000106 elif op is CHARSET:
107 code.extend(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000108 elif op is CATEGORY:
109 if flags & SRE_FLAG_LOCALE:
110 emit(CHCODES[CH_LOCALE[av]])
111 elif flags & SRE_FLAG_UNICODE:
112 emit(CHCODES[CH_UNICODE[av]])
113 else:
114 emit(CHCODES[av])
115 else:
116 raise error, "internal: unsupported set operator"
117 emit(OPCODES[FAILURE])
118 code[skip] = len(code) - skip
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000119 elif op is ANY:
120 if flags & SRE_FLAG_DOTALL:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000121 emit(OPCODES[op])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000122 else:
123 emit(OPCODES[CATEGORY])
124 emit(CHCODES[CATEGORY_NOT_LINEBREAK])
Fredrik Lundh90a07912000-06-30 07:50:59 +0000125 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
126 if flags & SRE_FLAG_TEMPLATE:
127 emit(OPCODES[REPEAT])
128 skip = len(code); emit(0)
129 emit(av[0])
130 emit(av[1])
131 _compile(code, av[2], flags)
132 emit(OPCODES[SUCCESS])
133 code[skip] = len(code) - skip
134 else:
135 lo, hi = av[2].getwidth()
136 if lo == 0:
137 raise error, "nothing to repeat"
138 if 0 and lo == hi == 1 and op is MAX_REPEAT:
139 # FIXME: <fl> need a better way to figure out when
140 # it's safe to use this one (in the parser, probably)
141 emit(OPCODES[MAX_REPEAT_ONE])
142 skip = len(code); emit(0)
143 emit(av[0])
144 emit(av[1])
145 _compile(code, av[2], flags)
146 emit(OPCODES[SUCCESS])
147 code[skip] = len(code) - skip
148 else:
149 emit(OPCODES[op])
150 skip = len(code); emit(0)
151 emit(av[0])
152 emit(av[1])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000153 mark = MAXCODE
154 if av[2][0][0] == SUBPATTERN:
155 # repeated subpattern
156 gid, foo = av[2][0][1]
157 if gid:
158 mark = (gid-1)*2
159 emit(mark)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000160 _compile(code, av[2], flags)
161 emit(OPCODES[SUCCESS])
162 code[skip] = len(code) - skip
163 elif op is SUBPATTERN:
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000164 gid = av[0]
165 if gid:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000166 emit(OPCODES[MARK])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000167 emit((gid-1)*2)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000168 _compile(code, av[1], flags)
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000169 if gid:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000170 emit(OPCODES[MARK])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000171 emit((gid-1)*2+1)
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000172 elif op in (SUCCESS, FAILURE):
173 emit(OPCODES[op])
Fredrik Lundh6f013982000-07-03 18:44:21 +0000174 elif op in (ASSERT, ASSERT_NOT):
175 emit(OPCODES[op])
176 skip = len(code); emit(0)
177 if av[0] >= 0:
178 emit(0) # look ahead
179 else:
180 lo, hi = av[1].getwidth()
181 if lo != hi:
182 raise error, "look-behind requires fixed-width pattern"
183 emit(lo) # look behind
184 _compile(code, av[1], flags)
185 emit(OPCODES[SUCCESS])
186 code[skip] = len(code) - skip
187 elif op is CALL:
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000188 emit(OPCODES[op])
189 skip = len(code); emit(0)
190 _compile(code, av, flags)
191 emit(OPCODES[SUCCESS])
192 code[skip] = len(code) - skip
193 elif op is AT:
194 emit(OPCODES[op])
195 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundh55a4f4a2000-06-30 22:37:31 +0000196 emit(ATCODES[AT_MULTILINE.get(av, av)])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000197 else:
198 emit(ATCODES[av])
199 elif op is BRANCH:
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000200 tail = []
201 for av in av[1]:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000202 emit(OPCODES[op])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000203 skip = len(code); emit(0)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000204 emit(MAXCODE) # save mark
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000205 _compile(code, av, flags)
206 emit(OPCODES[JUMP])
207 tail.append(len(code)); emit(0)
208 code[skip] = len(code) - skip
209 emit(0) # end of branch
210 for tail in tail:
211 code[tail] = len(code) - tail
212 elif op is CATEGORY:
213 emit(OPCODES[op])
214 if flags & SRE_FLAG_LOCALE:
215 emit(CHCODES[CH_LOCALE[av]])
216 elif flags & SRE_FLAG_UNICODE:
217 emit(CHCODES[CH_UNICODE[av]])
218 else:
219 emit(CHCODES[av])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000220 elif op is GROUPREF:
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000221 if flags & SRE_FLAG_IGNORECASE:
222 emit(OPCODES[OP_IGNORE[op]])
223 else:
224 emit(OPCODES[op])
225 emit(av-1)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000226 elif op in (MARK, INDEX):
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000227 emit(OPCODES[op])
228 emit(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000229 else:
230 raise ValueError, ("unsupported operand type", op)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000231
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000232def _compile_info(code, pattern, flags):
233 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000234 # this contains min/max pattern width, and an optional literal
235 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000236 lo, hi = pattern.getwidth()
237 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000238 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000239 # look for a literal prefix
240 prefix = []
Fredrik Lundh3562f112000-07-02 12:00:07 +0000241 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000242 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000243 for op, av in pattern.data:
244 if op is LITERAL:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000245 prefix.append(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000246 else:
247 break
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000248 # add an info block
249 emit = code.append
250 emit(OPCODES[INFO])
251 skip = len(code); emit(0)
252 # literal flag
253 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000254 if prefix:
255 mask = SRE_INFO_PREFIX
256 if len(prefix) == len(pattern.data):
257 mask = mask + SRE_INFO_LITERAL
258 elif charset:
259 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000260 emit(mask)
261 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000262 if lo < MAXCODE:
263 emit(lo)
264 else:
265 emit(MAXCODE)
266 prefix = prefix[:MAXCODE]
267 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000268 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000269 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000270 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000271 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000272 if prefix:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000273 emit(len(prefix))
274 if prefix:
275 code.extend(prefix)
276 # generate overlap table
277 table = [-1] + ([0]*len(prefix))
278 for i in range(len(prefix)):
279 table[i+1] = table[i]+1
280 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
281 table[i+1] = table[table[i+1]-1]+1
282 code.extend(table[1:]) # don't store first entry
283 elif charset:
284 for char in charset:
285 emit(OPCODES[LITERAL])
286 emit(char)
287 emit(OPCODES[FAILURE])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000288 code[skip] = len(code) - skip
289
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000290STRING_TYPES = [type("")]
291
292try:
293 STRING_TYPES.append(type(unicode("")))
294except NameError:
295 pass
296
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000297def compile(p, flags=0):
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000298 # internal: convert pattern list to internal format
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000299
300 # compile, as necessary
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000301 if type(p) in STRING_TYPES:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000302 import sre_parse
303 pattern = p
Fredrik Lundh55a4f4a2000-06-30 22:37:31 +0000304 p = sre_parse.parse(p, flags)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000305 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000306 pattern = None
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000307
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000308 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000309 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000310
311 # compile info block
312 _compile_info(code, p, flags)
313
314 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000316
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000317 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 # print code
320
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000321 # FIXME: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000322 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000323 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000324
Fredrik Lundhc2301732000-07-02 22:25:39 +0000325 # map in either direction
326 groupindex = p.pattern.groupdict
327 indexgroup = [None] * p.pattern.groups
328 for k, i in groupindex.items():
329 indexgroup[i] = k
330
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000331 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000332 pattern, flags, code,
333 p.pattern.groups-1,
334 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000335 )