blob: 27ab1fe7482a0c2fc9046cc0cb392d008d509d6b [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Fred Drakeb8f22742001-09-04 19:10:20 +000011"""Internal support module for sre"""
12
Fredrik Lundh4fb70272002-06-27 20:08:25 +000013import _sre, sys
Guido van Rossum7627c0d2000-03-31 14:58:54 +000014
15from sre_constants import *
16
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000017assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
Martin v. Löwis78e2f062003-04-19 12:56:08 +000019if _sre.CODESIZE == 2:
20 MAXCODE = 65535
21else:
22 MAXCODE = 0xFFFFFFFFL
Fredrik Lundh3562f112000-07-02 12:00:07 +000023
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000024def _identityfunction(x):
25 return x
26
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000027def _compile(code, pattern, flags):
28 # internal: compile a (sub)pattern
29 emit = code.append
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000030 _len = len
31 LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1}
32 REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1}
33 SUCCESS_CODES = {SUCCESS:1, FAILURE:1}
34 ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1}
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000035 for op, av in pattern:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000036 if op in LITERAL_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000037 if flags & SRE_FLAG_IGNORECASE:
38 emit(OPCODES[OP_IGNORE[op]])
Fredrik Lundh2e240442001-01-15 18:28:14 +000039 emit(_sre.getlower(av, flags))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000040 else:
41 emit(OPCODES[op])
Fredrik Lundh2e240442001-01-15 18:28:14 +000042 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000043 elif op is IN:
44 if flags & SRE_FLAG_IGNORECASE:
45 emit(OPCODES[OP_IGNORE[op]])
46 def fixup(literal, flags=flags):
47 return _sre.getlower(literal, flags)
48 else:
49 emit(OPCODES[op])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000050 fixup = _identityfunction
51 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000052 _compile_charset(av, flags, code, fixup)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000053 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000054 elif op is ANY:
55 if flags & SRE_FLAG_DOTALL:
56 emit(OPCODES[ANY_ALL])
57 else:
58 emit(OPCODES[ANY])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000059 elif op in REPEATING_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000060 if flags & SRE_FLAG_TEMPLATE:
61 raise error, "internal: unsupported template operator"
62 emit(OPCODES[REPEAT])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000063 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000064 emit(av[0])
65 emit(av[1])
66 _compile(code, av[2], flags)
67 emit(OPCODES[SUCCESS])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000068 code[skip] = _len(code) - skip
69 elif _simple(av) and op is not REPEAT:
70 if op is MAX_REPEAT:
Guido van Rossum41c99e72003-04-14 17:59:34 +000071 emit(OPCODES[REPEAT_ONE])
72 else:
73 emit(OPCODES[MIN_REPEAT_ONE])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000074 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000075 emit(av[0])
76 emit(av[1])
77 _compile(code, av[2], flags)
78 emit(OPCODES[SUCCESS])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000079 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000080 else:
81 emit(OPCODES[REPEAT])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000082 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000083 emit(av[0])
84 emit(av[1])
85 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000086 code[skip] = _len(code) - skip
87 if op is MAX_REPEAT:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000088 emit(OPCODES[MAX_UNTIL])
89 else:
90 emit(OPCODES[MIN_UNTIL])
91 elif op is SUBPATTERN:
92 if av[0]:
93 emit(OPCODES[MARK])
94 emit((av[0]-1)*2)
95 # _compile_info(code, av[1], flags)
96 _compile(code, av[1], flags)
97 if av[0]:
98 emit(OPCODES[MARK])
99 emit((av[0]-1)*2+1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000100 elif op in SUCCESS_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000101 emit(OPCODES[op])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000102 elif op in ASSERT_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000103 emit(OPCODES[op])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000104 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000105 if av[0] >= 0:
106 emit(0) # look ahead
107 else:
108 lo, hi = av[1].getwidth()
109 if lo != hi:
110 raise error, "look-behind requires fixed-width pattern"
111 emit(lo) # look behind
112 _compile(code, av[1], flags)
113 emit(OPCODES[SUCCESS])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000114 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000115 elif op is CALL:
116 emit(OPCODES[op])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000117 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000118 _compile(code, av, flags)
119 emit(OPCODES[SUCCESS])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000120 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000121 elif op is AT:
122 emit(OPCODES[op])
123 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000124 av = AT_MULTILINE.get(av, av)
125 if flags & SRE_FLAG_LOCALE:
126 av = AT_LOCALE.get(av, av)
127 elif flags & SRE_FLAG_UNICODE:
128 av = AT_UNICODE.get(av, av)
129 emit(ATCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000130 elif op is BRANCH:
131 emit(OPCODES[op])
132 tail = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000133 tailappend = tail.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000134 for av in av[1]:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000135 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000136 # _compile_info(code, av, flags)
137 _compile(code, av, flags)
138 emit(OPCODES[JUMP])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000139 tailappend(_len(code)); emit(0)
140 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000141 emit(0) # end of branch
142 for tail in tail:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000143 code[tail] = _len(code) - tail
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000144 elif op is CATEGORY:
145 emit(OPCODES[op])
146 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000147 av = CH_LOCALE[av]
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000148 elif flags & SRE_FLAG_UNICODE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000149 av = CH_UNICODE[av]
150 emit(CHCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000151 elif op is GROUPREF:
152 if flags & SRE_FLAG_IGNORECASE:
153 emit(OPCODES[OP_IGNORE[op]])
154 else:
155 emit(OPCODES[op])
156 emit(av-1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000157 elif op is GROUPREF_EXISTS:
158 emit(OPCODES[op])
159 emit((av[0]-1)*2)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000160 skipyes = _len(code); emit(0)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000161 _compile(code, av[1], flags)
162 if av[2]:
163 emit(OPCODES[JUMP])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000164 skipno = _len(code); emit(0)
165 code[skipyes] = _len(code) - skipyes + 1
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000166 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000167 code[skipno] = _len(code) - skipno
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000168 else:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000169 code[skipyes] = _len(code) - skipyes + 1
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000170 else:
171 raise ValueError, ("unsupported operand type", op)
172
173def _compile_charset(charset, flags, code, fixup=None):
174 # compile charset subprogram
175 emit = code.append
Raymond Hettingerf13eb552002-06-02 00:40:05 +0000176 if fixup is None:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000177 fixup = _identityfunction
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000178 for op, av in _optimize_charset(charset, fixup):
179 emit(OPCODES[op])
180 if op is NEGATE:
181 pass
182 elif op is LITERAL:
183 emit(fixup(av))
184 elif op is RANGE:
185 emit(fixup(av[0]))
186 emit(fixup(av[1]))
187 elif op is CHARSET:
188 code.extend(av)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000189 elif op is BIGCHARSET:
190 code.extend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000191 elif op is CATEGORY:
192 if flags & SRE_FLAG_LOCALE:
193 emit(CHCODES[CH_LOCALE[av]])
194 elif flags & SRE_FLAG_UNICODE:
195 emit(CHCODES[CH_UNICODE[av]])
196 else:
197 emit(CHCODES[av])
198 else:
199 raise error, "internal: unsupported set operator"
200 emit(OPCODES[FAILURE])
201
202def _optimize_charset(charset, fixup):
203 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000204 out = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000205 outappend = out.append
Raymond Hettingerd732c952004-03-27 09:24:36 +0000206 charmap = [0]*256
Fredrik Lundh3562f112000-07-02 12:00:07 +0000207 try:
208 for op, av in charset:
209 if op is NEGATE:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000210 outappend((op, av))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000211 elif op is LITERAL:
Raymond Hettingerd732c952004-03-27 09:24:36 +0000212 charmap[fixup(av)] = 1
Fredrik Lundh3562f112000-07-02 12:00:07 +0000213 elif op is RANGE:
214 for i in range(fixup(av[0]), fixup(av[1])+1):
Raymond Hettingerd732c952004-03-27 09:24:36 +0000215 charmap[i] = 1
Fredrik Lundh3562f112000-07-02 12:00:07 +0000216 elif op is CATEGORY:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000217 # XXX: could append to charmap tail
Fredrik Lundh3562f112000-07-02 12:00:07 +0000218 return charset # cannot compress
219 except IndexError:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000220 # character set contains unicode characters
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000221 return _optimize_unicode(charset, fixup)
Fredrik Lundh3562f112000-07-02 12:00:07 +0000222 # compress character map
223 i = p = n = 0
224 runs = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000225 runsappend = runs.append
Fredrik Lundh3562f112000-07-02 12:00:07 +0000226 for c in charmap:
227 if c:
228 if n == 0:
229 p = i
230 n = n + 1
231 elif n:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000232 runsappend((p, n))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000233 n = 0
234 i = i + 1
235 if n:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000236 runsappend((p, n))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000237 if len(runs) <= 2:
238 # use literal/range
239 for p, n in runs:
240 if n == 1:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000241 outappend((LITERAL, p))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000242 else:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000243 outappend((RANGE, (p, p+n-1)))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000244 if len(out) < len(charset):
245 return out
246 else:
247 # use bitmap
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000248 data = _mk_bitmap(charmap)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000249 outappend((CHARSET, data))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000250 return out
251 return charset
252
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000253def _mk_bitmap(bits):
254 data = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000255 dataappend = data.append
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000256 if _sre.CODESIZE == 2:
257 start = (1, 0)
258 else:
259 start = (1L, 0L)
260 m, v = start
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000261 for c in bits:
262 if c:
263 v = v + m
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000264 m = m + m
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000265 if m > MAXCODE:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000266 dataappend(v)
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000267 m, v = start
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000268 return data
269
270# To represent a big charset, first a bitmap of all characters in the
271# set is constructed. Then, this bitmap is sliced into chunks of 256
272# characters, duplicate chunks are eliminitated, and each chunk is
273# given a number. In the compiled expression, the charset is
274# represented by a 16-bit word sequence, consisting of one word for
275# the number of different chunks, a sequence of 256 bytes (128 words)
276# of chunk numbers indexed by their original chunk position, and a
277# sequence of chunks (16 words each).
278
279# Compression is normally good: in a typical charset, large ranges of
280# Unicode will be either completely excluded (e.g. if only cyrillic
281# letters are to be matched), or completely included (e.g. if large
282# subranges of Kanji match). These ranges will be represented by
283# chunks of all one-bits or all zero-bits.
284
285# Matching can be also done efficiently: the more significant byte of
286# the Unicode character is an index into the chunk number, and the
287# less significant byte is a bit index in the chunk (just like the
288# CHARSET matching).
289
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000290# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
291# of the basic multilingual plane; an efficient representation
292# for all of UTF-16 has not yet been developed. This means,
293# in particular, that negated charsets cannot be represented as
294# bigcharsets.
295
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000296def _optimize_unicode(charset, fixup):
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000297 try:
298 import array
299 except ImportError:
300 return charset
Raymond Hettingerd732c952004-03-27 09:24:36 +0000301 charmap = [0]*65536
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000302 negate = 0
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000303 try:
304 for op, av in charset:
305 if op is NEGATE:
306 negate = 1
307 elif op is LITERAL:
Raymond Hettingerd732c952004-03-27 09:24:36 +0000308 charmap[fixup(av)] = 1
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000309 elif op is RANGE:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000310 for i in xrange(fixup(av[0]), fixup(av[1])+1):
Raymond Hettingerd732c952004-03-27 09:24:36 +0000311 charmap[i] = 1
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000312 elif op is CATEGORY:
313 # XXX: could expand category
314 return charset # cannot compress
315 except IndexError:
316 # non-BMP characters
317 return charset
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000318 if negate:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000319 if sys.maxunicode != 65535:
320 # XXX: negation does not work with big charsets
321 return charset
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000322 for i in xrange(65536):
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000323 charmap[i] = not charmap[i]
324 comps = {}
325 mapping = [0]*256
326 block = 0
327 data = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000328 for i in xrange(256):
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000329 chunk = tuple(charmap[i*256:(i+1)*256])
330 new = comps.setdefault(chunk, block)
331 mapping[i] = new
332 if new == block:
Fredrik Lundh4fb70272002-06-27 20:08:25 +0000333 block = block + 1
334 data = data + _mk_bitmap(chunk)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000335 header = [block]
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000336 if _sre.CODESIZE == 2:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000337 code = 'H'
338 else:
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000339 code = 'I'
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000340 # Convert block indices to byte array of 256 bytes
341 mapping = array.array('b', mapping).tostring()
342 # Convert byte array to word array
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000343 mapping = array.array(code, mapping)
344 assert mapping.itemsize == _sre.CODESIZE
345 header = header + mapping.tolist()
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000346 data[0:0] = header
Tim Peters87cc0c32001-07-21 01:41:30 +0000347 return [(BIGCHARSET, data)]
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000348
Fredrik Lundhe1869832000-08-01 22:47:49 +0000349def _simple(av):
350 # check if av is a "simple" operator
351 lo, hi = av[2].getwidth()
Fredrik Lundh13ac9922000-10-07 17:38:23 +0000352 if lo == 0 and hi == MAXREPEAT:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000353 raise error, "nothing to repeat"
354 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
355
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000356def _compile_info(code, pattern, flags):
357 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000358 # this contains min/max pattern width, and an optional literal
359 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000360 lo, hi = pattern.getwidth()
361 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000362 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000363 # look for a literal prefix
364 prefix = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000365 prefixappend = prefix.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000366 prefix_skip = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000367 charset = [] # not used
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000368 charsetappend = charset.append
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000369 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000370 # look for literal prefix
Fredrik Lundh90a07912000-06-30 07:50:59 +0000371 for op, av in pattern.data:
372 if op is LITERAL:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000373 if len(prefix) == prefix_skip:
374 prefix_skip = prefix_skip + 1
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000375 prefixappend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000376 elif op is SUBPATTERN and len(av[1]) == 1:
377 op, av = av[1][0]
378 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000379 prefixappend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000380 else:
381 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000382 else:
383 break
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000384 # if no prefix, look for charset prefix
385 if not prefix and pattern.data:
386 op, av = pattern.data[0]
387 if op is SUBPATTERN and av[1]:
388 op, av = av[1][0]
389 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000390 charsetappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000391 elif op is BRANCH:
392 c = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000393 cappend = c.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000394 for p in av[1]:
395 if not p:
396 break
397 op, av = p[0]
398 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000399 cappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000400 else:
401 break
402 else:
403 charset = c
404 elif op is BRANCH:
405 c = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000406 cappend = c.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000407 for p in av[1]:
408 if not p:
409 break
410 op, av = p[0]
411 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000412 cappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000413 else:
414 break
415 else:
416 charset = c
417 elif op is IN:
418 charset = av
419## if prefix:
420## print "*** PREFIX", prefix, prefix_skip
421## if charset:
422## print "*** CHARSET", charset
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000423 # add an info block
424 emit = code.append
425 emit(OPCODES[INFO])
426 skip = len(code); emit(0)
427 # literal flag
428 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000429 if prefix:
430 mask = SRE_INFO_PREFIX
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000431 if len(prefix) == prefix_skip == len(pattern.data):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000432 mask = mask + SRE_INFO_LITERAL
433 elif charset:
434 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000435 emit(mask)
436 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000437 if lo < MAXCODE:
438 emit(lo)
439 else:
440 emit(MAXCODE)
441 prefix = prefix[:MAXCODE]
442 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000443 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000444 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000445 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000446 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000447 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000448 emit(len(prefix)) # length
449 emit(prefix_skip) # skip
450 code.extend(prefix)
451 # generate overlap table
452 table = [-1] + ([0]*len(prefix))
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000453 for i in xrange(len(prefix)):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000454 table[i+1] = table[i]+1
455 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
456 table[i+1] = table[table[i+1]-1]+1
457 code.extend(table[1:]) # don't store first entry
Fredrik Lundh3562f112000-07-02 12:00:07 +0000458 elif charset:
Guido van Rossum577fb5a2003-02-24 01:18:35 +0000459 _compile_charset(charset, flags, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000460 code[skip] = len(code) - skip
461
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000462try:
Just van Rossum12723ba2003-07-02 20:03:04 +0000463 unicode
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000464except NameError:
Just van Rossum74902502003-07-02 21:37:16 +0000465 STRING_TYPES = (type(""),)
Just van Rossum12723ba2003-07-02 20:03:04 +0000466else:
467 STRING_TYPES = (type(""), type(unicode("")))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000468
Just van Rossum74902502003-07-02 21:37:16 +0000469def isstring(obj):
470 for tp in STRING_TYPES:
471 if isinstance(obj, tp):
472 return 1
473 return 0
474
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000475def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000476
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000477 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000478 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000479
480 # compile info block
481 _compile_info(code, p, flags)
482
483 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000484 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000485
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000486 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000487
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000488 return code
489
490def compile(p, flags=0):
491 # internal: convert pattern list to internal format
492
Just van Rossum74902502003-07-02 21:37:16 +0000493 if isstring(p):
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000494 import sre_parse
495 pattern = p
496 p = sre_parse.parse(p, flags)
497 else:
498 pattern = None
499
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000500 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000501
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000502 # print code
503
Fredrik Lundh770617b2001-01-14 15:06:11 +0000504 # XXX: <fl> get rid of this limitation!
Fredrik Lundh5e7d51b2004-10-15 06:15:08 +0000505 if p.pattern.groups > 100:
506 raise AssertionError(
507 "sorry, but this version only supports 100 named groups"
508 )
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000509
Fredrik Lundhc2301732000-07-02 22:25:39 +0000510 # map in either direction
511 groupindex = p.pattern.groupdict
512 indexgroup = [None] * p.pattern.groups
513 for k, i in groupindex.items():
514 indexgroup[i] = k
515
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000516 return _sre.compile(
Fredrik Lundh6f013982000-07-03 18:44:21 +0000517 pattern, flags, code,
518 p.pattern.groups-1,
519 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000520 )