blob: a593ee73f05d682c3f9b0fe3e86219af7421c144 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossum7627c0d2000-03-31 14:58:54 +00008# Portions of this engine have been developed in cooperation with
Fredrik Lundh22d25462000-07-01 17:50:59 +00009# CNRI. Hewlett-Packard provided funding for 2.0 integration and
Guido van Rossum7627c0d2000-03-31 14:58:54 +000010# other compatibility work.
11#
12
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000013import array
Guido van Rossum7627c0d2000-03-31 14:58:54 +000014import _sre
15
16from sre_constants import *
17
18# find an array type code that matches the engine's code size
Fredrik Lundh3562f112000-07-02 12:00:07 +000019for WORDSIZE in "Hil":
Guido van Rossum7627c0d2000-03-31 14:58:54 +000020 if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
Fredrik Lundh90a07912000-06-30 07:50:59 +000021 break
Guido van Rossum7627c0d2000-03-31 14:58:54 +000022else:
23 raise RuntimeError, "cannot find a useable array type"
24
Fredrik Lundh3562f112000-07-02 12:00:07 +000025MAXCODE = 65535
26
27def _charset(charset, fixup):
28 # internal: optimize character set
29 out = []
30 charmap = [0]*256
31 try:
32 for op, av in charset:
33 if op is NEGATE:
34 out.append((op, av))
35 elif op is LITERAL:
36 charmap[fixup(av)] = 1
37 elif op is RANGE:
38 for i in range(fixup(av[0]), fixup(av[1])+1):
39 charmap[i] = 1
40 elif op is CATEGORY:
41 # FIXME: could append to charmap tail
42 return charset # cannot compress
43 except IndexError:
44 # unicode
45 return charset
46 # compress character map
47 i = p = n = 0
48 runs = []
49 for c in charmap:
50 if c:
51 if n == 0:
52 p = i
53 n = n + 1
54 elif n:
55 runs.append((p, n))
56 n = 0
57 i = i + 1
58 if n:
59 runs.append((p, n))
60 if len(runs) <= 2:
61 # use literal/range
62 for p, n in runs:
63 if n == 1:
64 out.append((LITERAL, p))
65 else:
66 out.append((RANGE, (p, p+n-1)))
67 if len(out) < len(charset):
68 return out
69 else:
70 # use bitmap
71 data = []
72 m = 1; v = 0
73 for c in charmap:
74 if c:
75 v = v + m
76 m = m << 1
77 if m > MAXCODE:
78 data.append(v)
79 m = 1; v = 0
80 out.append((CHARSET, data))
81 return out
82 return charset
83
Fredrik Lundh436c3d582000-06-29 08:58:44 +000084def _compile(code, pattern, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +000085 # internal: compile a (sub)pattern
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000086 emit = code.append
Guido van Rossum7627c0d2000-03-31 14:58:54 +000087 for op, av in pattern:
Fredrik Lundh43b3b492000-06-30 10:41:31 +000088 if op in (LITERAL, NOT_LITERAL):
Fredrik Lundh90a07912000-06-30 07:50:59 +000089 if flags & SRE_FLAG_IGNORECASE:
90 emit(OPCODES[OP_IGNORE[op]])
91 else:
92 emit(OPCODES[op])
Fredrik Lundh0640e112000-06-30 13:55:15 +000093 emit(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +000094 elif op is IN:
95 if flags & SRE_FLAG_IGNORECASE:
96 emit(OPCODES[OP_IGNORE[op]])
97 def fixup(literal, flags=flags):
Fredrik Lundh0640e112000-06-30 13:55:15 +000098 return _sre.getlower(literal, flags)
Fredrik Lundh90a07912000-06-30 07:50:59 +000099 else:
100 emit(OPCODES[op])
Fredrik Lundh4ccea942000-06-30 18:39:20 +0000101 fixup = lambda x: x
Fredrik Lundh90a07912000-06-30 07:50:59 +0000102 skip = len(code); emit(0)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000103 for op, av in _charset(av, fixup):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000104 emit(OPCODES[op])
105 if op is NEGATE:
106 pass
107 elif op is LITERAL:
108 emit(fixup(av))
109 elif op is RANGE:
110 emit(fixup(av[0]))
111 emit(fixup(av[1]))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000112 elif op is CHARSET:
113 code.extend(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000114 elif op is CATEGORY:
115 if flags & SRE_FLAG_LOCALE:
116 emit(CHCODES[CH_LOCALE[av]])
117 elif flags & SRE_FLAG_UNICODE:
118 emit(CHCODES[CH_UNICODE[av]])
119 else:
120 emit(CHCODES[av])
121 else:
122 raise error, "internal: unsupported set operator"
123 emit(OPCODES[FAILURE])
124 code[skip] = len(code) - skip
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000125 elif op is ANY:
126 if flags & SRE_FLAG_DOTALL:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000127 emit(OPCODES[op])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000128 else:
129 emit(OPCODES[CATEGORY])
130 emit(CHCODES[CATEGORY_NOT_LINEBREAK])
Fredrik Lundh90a07912000-06-30 07:50:59 +0000131 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
132 if flags & SRE_FLAG_TEMPLATE:
133 emit(OPCODES[REPEAT])
134 skip = len(code); emit(0)
135 emit(av[0])
136 emit(av[1])
137 _compile(code, av[2], flags)
138 emit(OPCODES[SUCCESS])
139 code[skip] = len(code) - skip
140 else:
141 lo, hi = av[2].getwidth()
142 if lo == 0:
143 raise error, "nothing to repeat"
144 if 0 and lo == hi == 1 and op is MAX_REPEAT:
145 # FIXME: <fl> need a better way to figure out when
146 # it's safe to use this one (in the parser, probably)
147 emit(OPCODES[MAX_REPEAT_ONE])
148 skip = len(code); emit(0)
149 emit(av[0])
150 emit(av[1])
151 _compile(code, av[2], flags)
152 emit(OPCODES[SUCCESS])
153 code[skip] = len(code) - skip
154 else:
155 emit(OPCODES[op])
156 skip = len(code); emit(0)
157 emit(av[0])
158 emit(av[1])
159 _compile(code, av[2], flags)
160 emit(OPCODES[SUCCESS])
161 code[skip] = len(code) - skip
162 elif op is SUBPATTERN:
163 group = av[0]
164 if group:
165 emit(OPCODES[MARK])
166 emit((group-1)*2)
167 _compile(code, av[1], flags)
168 if group:
169 emit(OPCODES[MARK])
170 emit((group-1)*2+1)
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000171 elif op in (SUCCESS, FAILURE):
172 emit(OPCODES[op])
173 elif op in (ASSERT, ASSERT_NOT, CALL):
174 emit(OPCODES[op])
175 skip = len(code); emit(0)
176 _compile(code, av, flags)
177 emit(OPCODES[SUCCESS])
178 code[skip] = len(code) - skip
179 elif op is AT:
180 emit(OPCODES[op])
181 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundh55a4f4a2000-06-30 22:37:31 +0000182 emit(ATCODES[AT_MULTILINE.get(av, av)])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000183 else:
184 emit(ATCODES[av])
185 elif op is BRANCH:
186 emit(OPCODES[op])
187 tail = []
188 for av in av[1]:
189 skip = len(code); emit(0)
190 _compile(code, av, flags)
191 emit(OPCODES[JUMP])
192 tail.append(len(code)); emit(0)
193 code[skip] = len(code) - skip
194 emit(0) # end of branch
195 for tail in tail:
196 code[tail] = len(code) - tail
197 elif op is CATEGORY:
198 emit(OPCODES[op])
199 if flags & SRE_FLAG_LOCALE:
200 emit(CHCODES[CH_LOCALE[av]])
201 elif flags & SRE_FLAG_UNICODE:
202 emit(CHCODES[CH_UNICODE[av]])
203 else:
204 emit(CHCODES[av])
205 elif op is GROUP:
206 if flags & SRE_FLAG_IGNORECASE:
207 emit(OPCODES[OP_IGNORE[op]])
208 else:
209 emit(OPCODES[op])
210 emit(av-1)
211 elif op is MARK:
212 emit(OPCODES[op])
213 emit(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000214 else:
215 raise ValueError, ("unsupported operand type", op)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000216
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000217def _compile_info(code, pattern, flags):
218 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000219 # this contains min/max pattern width, and an optional literal
220 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000221 lo, hi = pattern.getwidth()
222 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000223 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000224 # look for a literal prefix
225 prefix = []
Fredrik Lundh3562f112000-07-02 12:00:07 +0000226 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000227 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000228 for op, av in pattern.data:
229 if op is LITERAL:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000230 prefix.append(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000231 else:
232 break
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000233 # add an info block
234 emit = code.append
235 emit(OPCODES[INFO])
236 skip = len(code); emit(0)
237 # literal flag
238 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000239 if prefix:
240 mask = SRE_INFO_PREFIX
241 if len(prefix) == len(pattern.data):
242 mask = mask + SRE_INFO_LITERAL
243 elif charset:
244 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000245 emit(mask)
246 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000247 if lo < MAXCODE:
248 emit(lo)
249 else:
250 emit(MAXCODE)
251 prefix = prefix[:MAXCODE]
252 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000253 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000254 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000255 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000256 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000257 if prefix:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000258 emit(len(prefix))
259 if prefix:
260 code.extend(prefix)
261 # generate overlap table
262 table = [-1] + ([0]*len(prefix))
263 for i in range(len(prefix)):
264 table[i+1] = table[i]+1
265 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
266 table[i+1] = table[table[i+1]-1]+1
267 code.extend(table[1:]) # don't store first entry
268 elif charset:
269 for char in charset:
270 emit(OPCODES[LITERAL])
271 emit(char)
272 emit(OPCODES[FAILURE])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000273 code[skip] = len(code) - skip
274
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000275def compile(p, flags=0):
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000276 # internal: convert pattern list to internal format
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000277
278 # compile, as necessary
Guido van Rossumb81e70e2000-04-10 17:10:48 +0000279 if type(p) in (type(""), type(u"")):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000280 import sre_parse
281 pattern = p
Fredrik Lundh55a4f4a2000-06-30 22:37:31 +0000282 p = sre_parse.parse(p, flags)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000283 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000284 pattern = None
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000285
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000286 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000287 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000288
289 # compile info block
290 _compile_info(code, p, flags)
291
292 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000293 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000294
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000295 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000296
297 # FIXME: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000298 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000299 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000300
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000301 return _sre.compile(
Fredrik Lundh90a07912000-06-30 07:50:59 +0000302 pattern, flags,
303 array.array(WORDSIZE, code).tostring(),
304 p.pattern.groups-1, p.pattern.groupdict
305 )