blob: 828b1702fe335cfc1115b5908d47eeed1692e19c [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossum7627c0d2000-03-31 14:58:54 +00008# Portions of this engine have been developed in cooperation with
Fredrik Lundh22d25462000-07-01 17:50:59 +00009# CNRI. Hewlett-Packard provided funding for 2.0 integration and
Guido van Rossum7627c0d2000-03-31 14:58:54 +000010# other compatibility work.
11#
12
Guido van Rossum7627c0d2000-03-31 14:58:54 +000013import _sre
14
15from sre_constants import *
16
Fredrik Lundh3562f112000-07-02 12:00:07 +000017MAXCODE = 65535
18
19def _charset(charset, fixup):
20 # internal: optimize character set
21 out = []
22 charmap = [0]*256
23 try:
24 for op, av in charset:
25 if op is NEGATE:
26 out.append((op, av))
27 elif op is LITERAL:
28 charmap[fixup(av)] = 1
29 elif op is RANGE:
30 for i in range(fixup(av[0]), fixup(av[1])+1):
31 charmap[i] = 1
32 elif op is CATEGORY:
33 # FIXME: could append to charmap tail
34 return charset # cannot compress
35 except IndexError:
36 # unicode
37 return charset
38 # compress character map
39 i = p = n = 0
40 runs = []
41 for c in charmap:
42 if c:
43 if n == 0:
44 p = i
45 n = n + 1
46 elif n:
47 runs.append((p, n))
48 n = 0
49 i = i + 1
50 if n:
51 runs.append((p, n))
52 if len(runs) <= 2:
53 # use literal/range
54 for p, n in runs:
55 if n == 1:
56 out.append((LITERAL, p))
57 else:
58 out.append((RANGE, (p, p+n-1)))
59 if len(out) < len(charset):
60 return out
61 else:
62 # use bitmap
63 data = []
64 m = 1; v = 0
65 for c in charmap:
66 if c:
67 v = v + m
68 m = m << 1
69 if m > MAXCODE:
70 data.append(v)
71 m = 1; v = 0
72 out.append((CHARSET, data))
73 return out
74 return charset
75
Fredrik Lundh436c3d582000-06-29 08:58:44 +000076def _compile(code, pattern, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +000077 # internal: compile a (sub)pattern
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000078 emit = code.append
Guido van Rossum7627c0d2000-03-31 14:58:54 +000079 for op, av in pattern:
Fredrik Lundh43b3b492000-06-30 10:41:31 +000080 if op in (LITERAL, NOT_LITERAL):
Fredrik Lundh90a07912000-06-30 07:50:59 +000081 if flags & SRE_FLAG_IGNORECASE:
82 emit(OPCODES[OP_IGNORE[op]])
83 else:
84 emit(OPCODES[op])
Fredrik Lundh0640e112000-06-30 13:55:15 +000085 emit(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +000086 elif op is IN:
87 if flags & SRE_FLAG_IGNORECASE:
88 emit(OPCODES[OP_IGNORE[op]])
89 def fixup(literal, flags=flags):
Fredrik Lundh0640e112000-06-30 13:55:15 +000090 return _sre.getlower(literal, flags)
Fredrik Lundh90a07912000-06-30 07:50:59 +000091 else:
92 emit(OPCODES[op])
Fredrik Lundh4ccea942000-06-30 18:39:20 +000093 fixup = lambda x: x
Fredrik Lundh90a07912000-06-30 07:50:59 +000094 skip = len(code); emit(0)
Fredrik Lundh3562f112000-07-02 12:00:07 +000095 for op, av in _charset(av, fixup):
Fredrik Lundh90a07912000-06-30 07:50:59 +000096 emit(OPCODES[op])
97 if op is NEGATE:
98 pass
99 elif op is LITERAL:
100 emit(fixup(av))
101 elif op is RANGE:
102 emit(fixup(av[0]))
103 emit(fixup(av[1]))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000104 elif op is CHARSET:
105 code.extend(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000106 elif op is CATEGORY:
107 if flags & SRE_FLAG_LOCALE:
108 emit(CHCODES[CH_LOCALE[av]])
109 elif flags & SRE_FLAG_UNICODE:
110 emit(CHCODES[CH_UNICODE[av]])
111 else:
112 emit(CHCODES[av])
113 else:
114 raise error, "internal: unsupported set operator"
115 emit(OPCODES[FAILURE])
116 code[skip] = len(code) - skip
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000117 elif op is ANY:
118 if flags & SRE_FLAG_DOTALL:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000119 emit(OPCODES[op])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000120 else:
121 emit(OPCODES[CATEGORY])
122 emit(CHCODES[CATEGORY_NOT_LINEBREAK])
Fredrik Lundh90a07912000-06-30 07:50:59 +0000123 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
124 if flags & SRE_FLAG_TEMPLATE:
125 emit(OPCODES[REPEAT])
126 skip = len(code); emit(0)
127 emit(av[0])
128 emit(av[1])
129 _compile(code, av[2], flags)
130 emit(OPCODES[SUCCESS])
131 code[skip] = len(code) - skip
132 else:
133 lo, hi = av[2].getwidth()
134 if lo == 0:
135 raise error, "nothing to repeat"
136 if 0 and lo == hi == 1 and op is MAX_REPEAT:
137 # FIXME: <fl> need a better way to figure out when
138 # it's safe to use this one (in the parser, probably)
139 emit(OPCODES[MAX_REPEAT_ONE])
140 skip = len(code); emit(0)
141 emit(av[0])
142 emit(av[1])
143 _compile(code, av[2], flags)
144 emit(OPCODES[SUCCESS])
145 code[skip] = len(code) - skip
146 else:
147 emit(OPCODES[op])
148 skip = len(code); emit(0)
149 emit(av[0])
150 emit(av[1])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000151 mark = MAXCODE
152 if av[2][0][0] == SUBPATTERN:
153 # repeated subpattern
154 gid, foo = av[2][0][1]
155 if gid:
156 mark = (gid-1)*2
157 emit(mark)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000158 _compile(code, av[2], flags)
159 emit(OPCODES[SUCCESS])
160 code[skip] = len(code) - skip
161 elif op is SUBPATTERN:
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000162 gid = av[0]
163 if gid:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000164 emit(OPCODES[MARK])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000165 emit((gid-1)*2)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000166 _compile(code, av[1], flags)
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000167 if gid:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000168 emit(OPCODES[MARK])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000169 emit((gid-1)*2+1)
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000170 elif op in (SUCCESS, FAILURE):
171 emit(OPCODES[op])
Fredrik Lundh6f013982000-07-03 18:44:21 +0000172 elif op in (ASSERT, ASSERT_NOT):
173 emit(OPCODES[op])
174 skip = len(code); emit(0)
175 if av[0] >= 0:
176 emit(0) # look ahead
177 else:
178 lo, hi = av[1].getwidth()
179 if lo != hi:
180 raise error, "look-behind requires fixed-width pattern"
181 emit(lo) # look behind
182 _compile(code, av[1], flags)
183 emit(OPCODES[SUCCESS])
184 code[skip] = len(code) - skip
185 elif op is CALL:
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000186 emit(OPCODES[op])
187 skip = len(code); emit(0)
188 _compile(code, av, flags)
189 emit(OPCODES[SUCCESS])
190 code[skip] = len(code) - skip
191 elif op is AT:
192 emit(OPCODES[op])
193 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundh55a4f4a2000-06-30 22:37:31 +0000194 emit(ATCODES[AT_MULTILINE.get(av, av)])
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000195 else:
196 emit(ATCODES[av])
197 elif op is BRANCH:
198 emit(OPCODES[op])
199 tail = []
200 for av in av[1]:
201 skip = len(code); emit(0)
202 _compile(code, av, flags)
203 emit(OPCODES[JUMP])
204 tail.append(len(code)); emit(0)
205 code[skip] = len(code) - skip
206 emit(0) # end of branch
207 for tail in tail:
208 code[tail] = len(code) - tail
209 elif op is CATEGORY:
210 emit(OPCODES[op])
211 if flags & SRE_FLAG_LOCALE:
212 emit(CHCODES[CH_LOCALE[av]])
213 elif flags & SRE_FLAG_UNICODE:
214 emit(CHCODES[CH_UNICODE[av]])
215 else:
216 emit(CHCODES[av])
Fredrik Lundh72b82ba2000-07-03 21:31:48 +0000217 elif op is GROUPREF:
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000218 if flags & SRE_FLAG_IGNORECASE:
219 emit(OPCODES[OP_IGNORE[op]])
220 else:
221 emit(OPCODES[op])
222 emit(av-1)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000223 elif op in (MARK, INDEX):
Fredrik Lundh43b3b492000-06-30 10:41:31 +0000224 emit(OPCODES[op])
225 emit(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000226 else:
227 raise ValueError, ("unsupported operand type", op)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000228
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000229def _compile_info(code, pattern, flags):
230 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000231 # this contains min/max pattern width, and an optional literal
232 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000233 lo, hi = pattern.getwidth()
234 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000235 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000236 # look for a literal prefix
237 prefix = []
Fredrik Lundh3562f112000-07-02 12:00:07 +0000238 charset = [] # not used
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000239 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000240 for op, av in pattern.data:
241 if op is LITERAL:
Fredrik Lundh0640e112000-06-30 13:55:15 +0000242 prefix.append(av)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000243 else:
244 break
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000245 # add an info block
246 emit = code.append
247 emit(OPCODES[INFO])
248 skip = len(code); emit(0)
249 # literal flag
250 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000251 if prefix:
252 mask = SRE_INFO_PREFIX
253 if len(prefix) == len(pattern.data):
254 mask = mask + SRE_INFO_LITERAL
255 elif charset:
256 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000257 emit(mask)
258 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000259 if lo < MAXCODE:
260 emit(lo)
261 else:
262 emit(MAXCODE)
263 prefix = prefix[:MAXCODE]
264 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000265 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000266 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000267 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000268 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000269 if prefix:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000270 emit(len(prefix))
271 if prefix:
272 code.extend(prefix)
273 # generate overlap table
274 table = [-1] + ([0]*len(prefix))
275 for i in range(len(prefix)):
276 table[i+1] = table[i]+1
277 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
278 table[i+1] = table[table[i+1]-1]+1
279 code.extend(table[1:]) # don't store first entry
280 elif charset:
281 for char in charset:
282 emit(OPCODES[LITERAL])
283 emit(char)
284 emit(OPCODES[FAILURE])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000285 code[skip] = len(code) - skip
286
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000287def compile(p, flags=0):
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000288 # internal: convert pattern list to internal format
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000289
290 # compile, as necessary
Guido van Rossumb81e70e2000-04-10 17:10:48 +0000291 if type(p) in (type(""), type(u"")):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000292 import sre_parse
293 pattern = p
Fredrik Lundh55a4f4a2000-06-30 22:37:31 +0000294 p = sre_parse.parse(p, flags)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000295 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000296 pattern = None
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000297
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000298 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000299 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000300
301 # compile info block
302 _compile_info(code, p, flags)
303
304 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000305 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000306
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000307 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000308
309 # FIXME: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000310 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000311 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000312
Fredrik Lundhc2301732000-07-02 22:25:39 +0000313 # map in either direction
314 groupindex = p.pattern.groupdict
315 indexgroup = [None] * p.pattern.groups
316 for k, i in groupindex.items():
317 indexgroup[i] = k
318
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000319 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000320 pattern, flags, code,
321 p.pattern.groups-1,
322 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000323 )