blob: 9fdc8f395a2dbba6794398a8fbada985d36b7885 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
6# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossum7627c0d2000-03-31 14:58:54 +00008# Portions of this engine have been developed in cooperation with
9# CNRI. Hewlett-Packard provided funding for 1.6 integration and
10# other compatibility work.
11#
12
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000013import array
Guido van Rossum7627c0d2000-03-31 14:58:54 +000014import _sre
15
16from sre_constants import *
17
18# find an array type code that matches the engine's code size
19for WORDSIZE in "BHil":
20 if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
Fredrik Lundh90a07912000-06-30 07:50:59 +000021 break
Guido van Rossum7627c0d2000-03-31 14:58:54 +000022else:
23 raise RuntimeError, "cannot find a useable array type"
24
Fredrik Lundh436c3d582000-06-29 08:58:44 +000025def _compile(code, pattern, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +000026 # internal: compile a (sub)pattern
Fredrik Lundhbe2211e2000-06-29 16:57:40 +000027 emit = code.append
Guido van Rossum7627c0d2000-03-31 14:58:54 +000028 for op, av in pattern:
Fredrik Lundh90a07912000-06-30 07:50:59 +000029 if op is ANY:
30 if flags & SRE_FLAG_DOTALL:
31 emit(OPCODES[op])
32 else:
33 emit(OPCODES[CATEGORY])
34 emit(CHCODES[CATEGORY_NOT_LINEBREAK])
35 elif op in (SUCCESS, FAILURE):
36 emit(OPCODES[op])
37 elif op is AT:
38 emit(OPCODES[op])
39 if flags & SRE_FLAG_MULTILINE:
40 emit(ATCODES[AT_MULTILINE[av]])
41 else:
42 emit(ATCODES[av])
43 elif op is BRANCH:
44 emit(OPCODES[op])
45 tail = []
46 for av in av[1]:
47 skip = len(code); emit(0)
48 _compile(code, av, flags)
49 emit(OPCODES[JUMP])
50 tail.append(len(code)); emit(0)
51 code[skip] = len(code) - skip
52 emit(0) # end of branch
53 for tail in tail:
54 code[tail] = len(code) - tail
55 elif op is CALL:
56 emit(OPCODES[op])
57 skip = len(code); emit(0)
58 _compile(code, av, flags)
59 emit(OPCODES[SUCCESS])
60 code[skip] = len(code) - skip
61 elif op is CATEGORY:
62 emit(OPCODES[op])
63 if flags & SRE_FLAG_LOCALE:
64 emit(CHCODES[CH_LOCALE[av]])
65 elif flags & SRE_FLAG_UNICODE:
66 emit(CHCODES[CH_UNICODE[av]])
67 else:
68 emit(CHCODES[av])
69 elif op is GROUP:
70 if flags & SRE_FLAG_IGNORECASE:
71 emit(OPCODES[OP_IGNORE[op]])
72 else:
73 emit(OPCODES[op])
74 emit(av-1)
75 elif op is IN:
76 if flags & SRE_FLAG_IGNORECASE:
77 emit(OPCODES[OP_IGNORE[op]])
78 def fixup(literal, flags=flags):
79 return _sre.getlower(ord(literal), flags)
80 else:
81 emit(OPCODES[op])
82 fixup = ord
83 skip = len(code); emit(0)
84 for op, av in av:
85 emit(OPCODES[op])
86 if op is NEGATE:
87 pass
88 elif op is LITERAL:
89 emit(fixup(av))
90 elif op is RANGE:
91 emit(fixup(av[0]))
92 emit(fixup(av[1]))
93 elif op is CATEGORY:
94 if flags & SRE_FLAG_LOCALE:
95 emit(CHCODES[CH_LOCALE[av]])
96 elif flags & SRE_FLAG_UNICODE:
97 emit(CHCODES[CH_UNICODE[av]])
98 else:
99 emit(CHCODES[av])
100 else:
101 raise error, "internal: unsupported set operator"
102 emit(OPCODES[FAILURE])
103 code[skip] = len(code) - skip
104 elif op in (LITERAL, NOT_LITERAL):
105 if flags & SRE_FLAG_IGNORECASE:
106 emit(OPCODES[OP_IGNORE[op]])
107 else:
108 emit(OPCODES[op])
109 emit(ord(av))
110 elif op is MARK:
111 emit(OPCODES[op])
112 emit(av)
113 elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
114 if flags & SRE_FLAG_TEMPLATE:
115 emit(OPCODES[REPEAT])
116 skip = len(code); emit(0)
117 emit(av[0])
118 emit(av[1])
119 _compile(code, av[2], flags)
120 emit(OPCODES[SUCCESS])
121 code[skip] = len(code) - skip
122 else:
123 lo, hi = av[2].getwidth()
124 if lo == 0:
125 raise error, "nothing to repeat"
126 if 0 and lo == hi == 1 and op is MAX_REPEAT:
127 # FIXME: <fl> need a better way to figure out when
128 # it's safe to use this one (in the parser, probably)
129 emit(OPCODES[MAX_REPEAT_ONE])
130 skip = len(code); emit(0)
131 emit(av[0])
132 emit(av[1])
133 _compile(code, av[2], flags)
134 emit(OPCODES[SUCCESS])
135 code[skip] = len(code) - skip
136 else:
137 emit(OPCODES[op])
138 skip = len(code); emit(0)
139 emit(av[0])
140 emit(av[1])
141 _compile(code, av[2], flags)
142 emit(OPCODES[SUCCESS])
143 code[skip] = len(code) - skip
144 elif op is SUBPATTERN:
145 group = av[0]
146 if group:
147 emit(OPCODES[MARK])
148 emit((group-1)*2)
149 _compile(code, av[1], flags)
150 if group:
151 emit(OPCODES[MARK])
152 emit((group-1)*2+1)
153 else:
154 raise ValueError, ("unsupported operand type", op)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000155
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000156def _compile_info(code, pattern, flags):
157 # internal: compile an info block. in the current version,
158 # this contains min/max pattern width and a literal prefix,
159 # if any
160 lo, hi = pattern.getwidth()
161 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000162 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000163 # look for a literal prefix
164 prefix = []
165 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000166 for op, av in pattern.data:
167 if op is LITERAL:
168 prefix.append(ord(av))
169 else:
170 break
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000171 # add an info block
172 emit = code.append
173 emit(OPCODES[INFO])
174 skip = len(code); emit(0)
175 # literal flag
176 mask = 0
177 if len(prefix) == len(pattern.data):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000178 mask = 1
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000179 emit(mask)
180 # pattern length
181 emit(lo)
182 if hi < 32768:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000183 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000184 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000185 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000186 # add literal prefix
187 emit(len(prefix))
188 if prefix:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000189 code.extend(prefix)
190 # generate overlap table
191 table = [-1] + ([0]*len(prefix))
192 for i in range(len(prefix)):
193 table[i+1] = table[i]+1
194 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
195 table[i+1] = table[table[i+1]-1]+1
196 code.extend(table[1:]) # don't store first entry
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000197 code[skip] = len(code) - skip
198
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000199def compile(p, flags=0):
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000200 # internal: convert pattern list to internal format
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000201
202 # compile, as necessary
Guido van Rossumb81e70e2000-04-10 17:10:48 +0000203 if type(p) in (type(""), type(u"")):
Fredrik Lundh90a07912000-06-30 07:50:59 +0000204 import sre_parse
205 pattern = p
206 p = sre_parse.parse(p)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000207 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000208 pattern = None
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000209
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000210 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000211 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000212
213 # compile info block
214 _compile_info(code, p, flags)
215
216 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000217 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000218
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000219 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000220
221 # FIXME: <fl> get rid of this limitation!
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000222 assert p.pattern.groups <= 100,\
Fredrik Lundh90a07912000-06-30 07:50:59 +0000223 "sorry, but this version only supports 100 named groups"
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000224
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000225 return _sre.compile(
Fredrik Lundh90a07912000-06-30 07:50:59 +0000226 pattern, flags,
227 array.array(WORDSIZE, code).tostring(),
228 p.pattern.groups-1, p.pattern.groupdict
229 )