blob: db4c500df1bb18e34dc4e429e0db0b0b258bb0fb [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
3# $Id$
4#
5# convert re-style regular expression to SRE template. the current
6# implementation is somewhat incomplete, and not very fast. should
7# definitely be rewritten before Python 1.6 goes beta.
8#
9# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
10#
11# This code can only be used for 1.6 alpha testing. All other use
12# require explicit permission from Secret Labs AB.
13#
14# Portions of this engine have been developed in cooperation with
15# CNRI. Hewlett-Packard provided funding for 1.6 integration and
16# other compatibility work.
17#
18
19# FIXME: comments marked with the FIXME tag are open issues. all such
20# issues should be closed before the final beta.
21
22import string, sys
23
24from sre_constants import *
25
26SPECIAL_CHARS = ".\\[{()*+?^$|"
27REPEAT_CHARS = "*+?{"
28
29OCTDIGITS = "01234567"
30HEXDIGITS = "0123456789abcdefABCDEF"
31
32ESCAPES = {
33 "\\a": (LITERAL, chr(7)),
34 "\\b": (LITERAL, chr(8)),
35 "\\f": (LITERAL, chr(12)),
36 "\\n": (LITERAL, chr(10)),
37 "\\r": (LITERAL, chr(13)),
38 "\\t": (LITERAL, chr(9)),
39 "\\v": (LITERAL, chr(11))
40}
41
42CATEGORIES = {
43 "\\A": (AT, AT_BEGINNING), # start of string
44 "\\b": (AT, AT_BOUNDARY),
45 "\\B": (AT, AT_NON_BOUNDARY),
46 "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
47 "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
48 "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
49 "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
50 "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
51 "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
52 "\\Z": (AT, AT_END), # end of string
53}
54
55class Pattern:
56 # FIXME: <fl> rename class, and store flags in here too!
57 def __init__(self):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000058 self.flags = []
59 self.groups = 1
60 self.groupdict = {}
Guido van Rossum7627c0d2000-03-31 14:58:54 +000061 def getgroup(self, name=None):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000062 gid = self.groups
63 self.groups = gid + 1
64 if name:
65 self.groupdict[name] = gid
66 return gid
Guido van Rossum7627c0d2000-03-31 14:58:54 +000067 def setflag(self, flag):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000068 if flag not in self.flags:
69 self.flags.append(flag)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000070
71class SubPattern:
72 # a subpattern, in intermediate form
73 def __init__(self, pattern, data=None):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000074 self.pattern = pattern
75 if not data:
76 data = []
77 self.data = data
78 self.flags = []
79 self.width = None
Guido van Rossum7627c0d2000-03-31 14:58:54 +000080 def __repr__(self):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000081 return repr(self.data)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000082 def __len__(self):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000083 return len(self.data)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000084 def __delitem__(self, index):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000085 del self.data[index]
Guido van Rossum7627c0d2000-03-31 14:58:54 +000086 def __getitem__(self, index):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000087 return self.data[index]
Guido van Rossum7627c0d2000-03-31 14:58:54 +000088 def __setitem__(self, index, code):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000089 self.data[index] = code
Guido van Rossum7627c0d2000-03-31 14:58:54 +000090 def __getslice__(self, start, stop):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000091 return SubPattern(self.pattern, self.data[start:stop])
Guido van Rossum7627c0d2000-03-31 14:58:54 +000092 def insert(self, index, code):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000093 self.data.insert(index, code)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000094 def append(self, code):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000095 self.data.append(code)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000096 def getwidth(self):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +000097 # determine the width (min, max) for this subpattern
98 if self.width:
99 return self.width
100 lo = hi = 0L
101 for op, av in self.data:
102 if op is BRANCH:
103 l = sys.maxint
104 h = 0
105 for av in av[1]:
106 i, j = av.getwidth()
107 l = min(l, i)
108 h = min(h, j)
109 lo = lo + i
110 hi = hi + j
111 elif op is CALL:
112 i, j = av.getwidth()
113 lo = lo + i
114 hi = hi + j
115 elif op is SUBPATTERN:
116 i, j = av[1].getwidth()
117 lo = lo + i
118 hi = hi + j
119 elif op in (MIN_REPEAT, MAX_REPEAT):
120 i, j = av[2].getwidth()
121 lo = lo + i * av[0]
122 hi = hi + j * av[1]
123 elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
124 lo = lo + 1
125 hi = hi + 1
126 elif op == SUCCESS:
127 break
128 self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
129 return self.width
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000130 def set(self, flag):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000131 if not flag in self.flags:
132 self.flags.append(flag)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000133 def reset(self, flag):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000134 if flag in self.flags:
135 self.flags.remove(flag)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000136
137class Tokenizer:
138 def __init__(self, string):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000139 self.string = list(string)
140 self.next = self.__next()
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000141 def __next(self):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000142 if not self.string:
143 return None
144 char = self.string[0]
145 if char[0] == "\\":
146 try:
147 c = self.string[1]
148 except IndexError:
149 raise SyntaxError, "bogus escape"
150 char = char + c
151 try:
152 if c == "x":
153 # hexadecimal constant
154 for i in xrange(2, sys.maxint):
155 c = self.string[i]
156 if c not in HEXDIGITS:
157 break
158 char = char + c
159 elif c in string.digits:
160 # decimal (or octal) number
161 for i in xrange(2, sys.maxint):
162 c = self.string[i]
163 # FIXME: if larger than current number of
164 # groups, interpret as an octal number
165 if c not in string.digits:
166 break
167 char = char + c
168 except IndexError:
169 pass # use what we've got this far
170 del self.string[0:len(char)]
171 return char
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000172 def match(self, char):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000173 if char == self.next:
174 self.next = self.__next()
175 return 1
176 return 0
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000177 def match_set(self, set):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000178 if self.next in set:
179 self.next = self.__next()
180 return 1
181 return 0
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000182 def get(self):
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000183 this = self.next
184 self.next = self.__next()
185 return this
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000186
187def _fixescape(escape, character_class=0):
188 # convert escape to (type, value)
189 if character_class:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000190 # inside a character class, we'll look in the character
191 # escapes dictionary first
192 code = ESCAPES.get(escape)
193 if code:
194 return code
195 code = CATEGORIES.get(escape)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000196 else:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000197 code = CATEGORIES.get(escape)
198 if code:
199 return code
200 code = ESCAPES.get(escape)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000201 if code:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000202 return code
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000203 if not character_class:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000204 try:
205 group = int(escape[1:])
206 # FIXME: only valid if group <= current number of groups
207 return GROUP, group
208 except ValueError:
209 pass
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000210 try:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000211 if escape[1:2] == "x":
212 escape = escape[2:]
213 return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff)
214 elif escape[1:2] in string.digits:
215 return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff)
216 elif len(escape) == 2:
217 return LITERAL, escape[1]
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000218 except ValueError:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000219 pass
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000220 raise SyntaxError, "bogus escape: %s" % repr(escape)
221
222def _branch(subpattern, items):
223
224 # form a branch operator from a set of items (FIXME: move this
225 # optimization to the compiler module!)
226
227 # check if all items share a common prefix
228 while 1:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000229 prefix = None
230 for item in items:
231 if not item:
232 break
233 if prefix is None:
234 prefix = item[0]
235 elif item[0] != prefix:
236 break
237 else:
238 # all subitems start with a common "prefix".
239 # move it out of the branch
240 for item in items:
241 del item[0]
242 subpattern.append(prefix)
243 continue # check next one
244 break
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000245
246 # check if the branch can be replaced by a character set
247 for item in items:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000248 if len(item) != 1 or item[0][0] != LITERAL:
249 break
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000250 else:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000251 # we can store this as a character set instead of a
252 # branch (FIXME: use a range if possible)
253 set = []
254 for item in items:
255 set.append(item[0])
256 subpattern.append((IN, set))
257 return
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000258
259 subpattern.append((BRANCH, (None, items)))
260
261def _parse(source, pattern, flags=()):
262
263 # parse regular expression pattern into an operator list.
264
265 subpattern = SubPattern(pattern)
266
267 this = None
268
269 while 1:
270
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000271 if source.next in ("|", ")"):
272 break # end of subpattern
273 this = source.get()
274 if this is None:
275 break # end of pattern
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000276
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000277 if this and this[0] not in SPECIAL_CHARS:
278 subpattern.append((LITERAL, this))
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000279
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000280 elif this == "[":
281 # character set
282 set = []
283## if source.match(":"):
284## pass # handle character classes
285 if source.match("^"):
286 set.append((NEGATE, None))
287 # check remaining characters
288 start = set[:]
289 while 1:
290 this = source.get()
291 if this == "]" and set != start:
292 break
293 elif this and this[0] == "\\":
294 code1 = _fixescape(this, 1)
295 elif this:
296 code1 = LITERAL, this
297 else:
298 raise SyntaxError, "unexpected end of regular expression"
299 if source.match("-"):
300 # potential range
301 this = source.get()
302 if this == "]":
303 set.append(code1)
304 set.append((LITERAL, "-"))
305 break
306 else:
307 if this[0] == "\\":
308 code2 = _fixescape(this, 1)
309 else:
310 code2 = LITERAL, this
311 if code1[0] != LITERAL or code2[0] != LITERAL:
312 raise SyntaxError, "illegal range"
313 if len(code1[1]) != 1 or len(code2[1]) != 1:
314 raise SyntaxError, "illegal range"
315 set.append((RANGE, (code1[1], code2[1])))
316 else:
317 if code1[0] is IN:
318 code1 = code1[1][0]
319 set.append(code1)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000320
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000321 # FIXME: <fl> move set optimization to support function
322 if len(set)==1 and set[0][0] is LITERAL:
323 subpattern.append(set[0]) # optimization
324 elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
325 subpattern.append((NOT_LITERAL, set[1][1])) # optimization
326 else:
327 # FIXME: <fl> add charmap optimization
328 subpattern.append((IN, set))
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000329
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000330 elif this and this[0] in REPEAT_CHARS:
331 # repeat previous item
332 if this == "?":
333 min, max = 0, 1
334 elif this == "*":
335 min, max = 0, sys.maxint
336 elif this == "+":
337 min, max = 1, sys.maxint
338 elif this == "{":
339 min, max = 0, sys.maxint
340 lo = hi = ""
341 while source.next in string.digits:
342 lo = lo + source.get()
343 if source.match(","):
344 while source.next in string.digits:
345 hi = hi + source.get()
346 else:
347 hi = lo
348 if not source.match("}"):
349 raise SyntaxError, "bogus range"
350 if lo:
351 min = int(lo)
352 if hi:
353 max = int(hi)
354 # FIXME: <fl> check that hi >= lo!
355 else:
356 raise SyntaxError, "not supported"
357 # figure out which item to repeat
358 # FIXME: should back up to the right mark, right?
359 if subpattern:
360 index = len(subpattern)-1
361 while subpattern[index][0] is MARK:
362 index = index - 1
363 item = subpattern[index:index+1]
364 else:
365 raise SyntaxError, "nothing to repeat"
366 if source.match("?"):
367 subpattern[index] = (MIN_REPEAT, (min, max, item))
368 else:
369 subpattern[index] = (MAX_REPEAT, (min, max, item))
370 elif this == ".":
371 subpattern.append((ANY, None))
372 elif this == "(":
373 group = 1
374 name = None
375 if source.match("?"):
376 group = 0
377 # options
378 if source.match("P"):
379 # named group: skip forward to end of name
380 if source.match("<"):
381 name = ""
382 while 1:
383 char = source.get()
384 if char in (">", None):
385 break
386 name = name + char
387 group = 1
388 elif source.match(":"):
389 # non-capturing group
390 group = 2
391 elif source.match_set("iI"):
392 pattern.setflag("i")
393 elif source.match_set("lL"):
394 pattern.setflag("l")
395 elif source.match_set("mM"):
396 pattern.setflag("m")
397 elif source.match_set("sS"):
398 pattern.setflag("s")
399 elif source.match_set("xX"):
400 pattern.setflag("x")
401 if group:
402 # parse group contents
403 b = []
404 if group == 2:
405 # anonymous group
406 group = None
407 else:
408 group = pattern.getgroup(name)
409 if group:
410 subpattern.append((MARK, (group-1)*2))
411 while 1:
412 p = _parse(source, pattern, flags)
413 if source.match(")"):
414 if b:
415 b.append(p)
416 _branch(subpattern, b)
417 else:
418 subpattern.append((SUBPATTERN, (group, p)))
419 break
420 elif source.match("|"):
421 b.append(p)
422 else:
423 raise SyntaxError, "group not properly closed"
424 if group:
425 subpattern.append((MARK, (group-1)*2+1))
426 else:
427 # FIXME: should this really be a while loop?
428 while source.get() not in (")", None):
429 pass
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000430
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000431 elif this == "^":
432 subpattern.append((AT, AT_BEGINNING))
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000433
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000434 elif this == "$":
435 subpattern.append((AT, AT_END))
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000436
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000437 elif this and this[0] == "\\":
438 code =_fixescape(this)
439 subpattern.append(code)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000440
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000441 else:
442 raise SyntaxError, "parser error"
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000443
444 return subpattern
445
446def parse(source, flags=()):
447 s = Tokenizer(source)
448 g = Pattern()
449 b = []
450 while 1:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000451 p = _parse(s, g, flags)
452 tail = s.get()
453 if tail == "|":
454 b.append(p)
455 elif tail == ")":
456 raise SyntaxError, "unbalanced parenthesis"
457 elif tail is None:
458 if b:
459 b.append(p)
460 p = SubPattern(g)
461 _branch(p, b)
462 break
463 else:
464 raise SyntaxError, "bogus characters at end of regular expression"
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000465 return p
466
467if __name__ == "__main__":
468 from pprint import pprint
469 from testpatterns import PATTERNS
470 a = b = c = 0
471 for pattern, flags in PATTERNS:
Andrew M. Kuchlinge3ba9312000-04-02 05:22:30 +0000472 if flags:
473 continue
474 print "-"*68
475 try:
476 p = parse(pattern)
477 print repr(pattern), "->"
478 pprint(p.data)
479 import sre_compile
480 try:
481 code = sre_compile.compile(p)
482 c = c + 1
483 except:
484 pass
485 a = a + 1
486 except SyntaxError, v:
487 print "**", repr(pattern), v
488 b = b + 1
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000489 print "-"*68
490 print a, "of", b, "patterns successfully parsed"
491 print c, "of", b, "patterns successfully compiled"
492