blob: 8b68ea1f82258a07a8ce69040658a57e75fdbd67 [file] [log] [blame]
Guido van Rossumaad67612000-05-08 17:31:04 +00001#
2# Secret Labs' Regular Expression Engine
3# $Id$
4#
5# convert re-style regular expression to SRE template. the current
6# implementation is somewhat incomplete, and not very fast. should
7# definitely be rewritten before Python 1.6 goes beta.
8#
9# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
10#
11# This code can only be used for 1.6 alpha testing. All other use
12# require explicit permission from Secret Labs AB.
13#
14# Portions of this engine have been developed in cooperation with
15# CNRI. Hewlett-Packard provided funding for 1.6 integration and
16# other compatibility work.
17#
18
19# FIXME: comments marked with the FIXME tag are open issues. all such
20# issues should be closed before the final beta.
21
22import string, sys
23
24from sre_constants import *
25
26SPECIAL_CHARS = ".\\[{()*+?^$|"
27REPEAT_CHARS = "*+?{"
28
29# FIXME: string in tuple tests may explode with if char is unicode :-(
30DIGITS = tuple(string.digits)
31
32OCTDIGITS = tuple("01234567")
33HEXDIGITS = tuple("0123456789abcdefABCDEF")
34
35ESCAPES = {
36 "\\a": (LITERAL, chr(7)),
37 "\\b": (LITERAL, chr(8)),
38 "\\f": (LITERAL, chr(12)),
39 "\\n": (LITERAL, chr(10)),
40 "\\r": (LITERAL, chr(13)),
41 "\\t": (LITERAL, chr(9)),
42 "\\v": (LITERAL, chr(11))
43}
44
45CATEGORIES = {
46 "\\A": (AT, AT_BEGINNING), # start of string
47 "\\b": (AT, AT_BOUNDARY),
48 "\\B": (AT, AT_NON_BOUNDARY),
49 "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
50 "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
51 "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
52 "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
53 "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
54 "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
55 "\\Z": (AT, AT_END), # end of string
56}
57
58class Pattern:
59 # FIXME: <fl> rename class, and store flags in here too!
60 def __init__(self):
61 self.flags = []
62 self.groups = 1
63 self.groupdict = {}
64 def getgroup(self, name=None):
65 gid = self.groups
66 self.groups = gid + 1
67 if name:
68 self.groupdict[name] = gid
69 return gid
70 def setflag(self, flag):
71 if flag in self.flags:
72 self.flags.append(flag)
73
74class SubPattern:
75 # a subpattern, in intermediate form
76 def __init__(self, pattern, data=None):
77 self.pattern = pattern
78 if not data:
79 data = []
80 self.data = data
81 self.flags = []
82 self.width = None
83 def __repr__(self):
84 return repr(self.data)
85 def __len__(self):
86 return len(self.data)
87 def __delitem__(self, index):
88 del self.data[index]
89 def __getitem__(self, index):
90 return self.data[index]
91 def __setitem__(self, index, code):
92 self.data[index] = code
93 def __getslice__(self, start, stop):
94 return SubPattern(self.pattern, self.data[start:stop])
95 def insert(self, index, code):
96 self.data.insert(index, code)
97 def append(self, code):
98 self.data.append(code)
99 def getwidth(self):
100 # determine the width (min, max) for this subpattern
101 if self.width:
102 return self.width
103 lo = hi = 0L
104 for op, av in self.data:
105 if op is BRANCH:
106 l = sys.maxint
107 h = 0
108 for av in av[1]:
109 i, j = av.getwidth()
110 l = min(l, i)
111 h = min(h, j)
112 lo = lo + i
113 hi = hi + j
114 elif op is CALL:
115 i, j = av.getwidth()
116 lo = lo + i
117 hi = hi + j
118 elif op is SUBPATTERN:
119 i, j = av[1].getwidth()
120 lo = lo + i
121 hi = hi + j
122 elif op in (MIN_REPEAT, MAX_REPEAT):
123 i, j = av[2].getwidth()
124 lo = lo + i * av[0]
125 hi = hi + j * av[1]
126 elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
127 lo = lo + 1
128 hi = hi + 1
129 elif op == SUCCESS:
130 break
131 self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
132 return self.width
133 def set(self, flag):
134 if not flag in self.flags:
135 self.flags.append(flag)
136 def reset(self, flag):
137 if flag in self.flags:
138 self.flags.remove(flag)
139
140class Tokenizer:
141 def __init__(self, string):
142 self.string = list(string)
143 self.next = self.__next()
144 def __next(self):
145 if not self.string:
146 return None
147 char = self.string[0]
148 if char[0] == "\\":
149 try:
150 c = self.string[1]
151 except IndexError:
152 raise SyntaxError, "bogus escape"
153 char = char + c
154 try:
155 if c == "x":
156 # hexadecimal constant
157 for i in xrange(2, sys.maxint):
158 c = self.string[i]
159 if str(c) not in HEXDIGITS:
160 break
161 char = char + c
162 elif str(c) in DIGITS:
163 # decimal (or octal) number
164 for i in xrange(2, sys.maxint):
165 c = self.string[i]
166 # FIXME: if larger than current number of
167 # groups, interpret as an octal number
168 if str(c) not in DIGITS:
169 break
170 char = char + c
171 except IndexError:
172 pass # use what we've got this far
173 del self.string[0:len(char)]
174 return char
175 def match(self, char):
176 if char == self.next:
177 self.next = self.__next()
178 return 1
179 return 0
180 def match_set(self, set):
181 if self.next and self.next in set:
182 self.next = self.__next()
183 return 1
184 return 0
185 def get(self):
186 this = self.next
187 self.next = self.__next()
188 return this
189
190def _fixescape(escape, character_class=0):
191 # convert escape to (type, value)
192 if character_class:
193 # inside a character class, we'll look in the character
194 # escapes dictionary first
195 code = ESCAPES.get(escape)
196 if code:
197 return code
198 code = CATEGORIES.get(escape)
199 else:
200 code = CATEGORIES.get(escape)
201 if code:
202 return code
203 code = ESCAPES.get(escape)
204 if code:
205 return code
206 if not character_class:
207 try:
208 group = int(escape[1:])
209 # FIXME: only valid if group <= current number of groups
210 return GROUP, group
211 except ValueError:
212 pass
213 try:
214 if escape[1:2] == "x":
215 escape = escape[2:]
216 return LITERAL, chr(int(escape[-2:], 16) & 0xff)
217 elif str(escape[1:2]) in DIGITS:
218 return LITERAL, chr(int(escape[1:], 8) & 0xff)
219 elif len(escape) == 2:
220 return LITERAL, escape[1]
221 except ValueError:
222 pass
223 raise SyntaxError, "bogus escape: %s" % repr(escape)
224
225def _branch(subpattern, items):
226
227 # form a branch operator from a set of items (FIXME: move this
228 # optimization to the compiler module!)
229
230 # check if all items share a common prefix
231 while 1:
232 prefix = None
233 for item in items:
234 if not item:
235 break
236 if prefix is None:
237 prefix = item[0]
238 elif item[0] != prefix:
239 break
240 else:
241 # all subitems start with a common "prefix".
242 # move it out of the branch
243 for item in items:
244 del item[0]
245 subpattern.append(prefix)
246 continue # check next one
247 break
248
249 # check if the branch can be replaced by a character set
250 for item in items:
251 if len(item) != 1 or item[0][0] != LITERAL:
252 break
253 else:
254 # we can store this as a character set instead of a
255 # branch (FIXME: use a range if possible)
256 set = []
257 for item in items:
258 set.append(item[0])
259 subpattern.append((IN, set))
260 return
261
262 subpattern.append((BRANCH, (None, items)))
263
264def _parse(source, pattern, flags=()):
265
266 # parse regular expression pattern into an operator list.
267
268 subpattern = SubPattern(pattern)
269
270 this = None
271
272 while 1:
273
274 if str(source.next) in ("|", ")"):
275 break # end of subpattern
276 this = source.get()
277 if this is None:
278 break # end of pattern
279
280 if this and this[0] not in SPECIAL_CHARS:
281 subpattern.append((LITERAL, this))
282
283 elif this == "[":
284 # character set
285 set = []
286## if source.match(":"):
287## pass # handle character classes
288 if source.match("^"):
289 set.append((NEGATE, None))
290 # check remaining characters
291 start = set[:]
292 while 1:
293 this = source.get()
294 if this == "]" and set != start:
295 break
296 elif this and this[0] == "\\":
297 code1 = _fixescape(this, 1)
298 elif this:
299 code1 = LITERAL, this
300 else:
301 raise SyntaxError, "unexpected end of regular expression"
302 if source.match("-"):
303 # potential range
304 this = source.get()
305 if this == "]":
306 set.append(code1)
307 set.append((LITERAL, "-"))
308 break
309 else:
310 if this[0] == "\\":
311 code2 = _fixescape(this, 1)
312 else:
313 code2 = LITERAL, this
314 if code1[0] != LITERAL or code2[0] != LITERAL:
315 raise SyntaxError, "illegal range"
316 if len(code1[1]) != 1 or len(code2[1]) != 1:
317 raise SyntaxError, "illegal range"
318 set.append((RANGE, (code1[1], code2[1])))
319 else:
320 if code1[0] is IN:
321 code1 = code1[1][0]
322 set.append(code1)
323
324 # FIXME: <fl> move set optimization to support function
325 if len(set)==1 and set[0][0] is LITERAL:
326 subpattern.append(set[0]) # optimization
327 elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
328 subpattern.append((NOT_LITERAL, set[1][1])) # optimization
329 else:
330 # FIXME: <fl> add charmap optimization
331 subpattern.append((IN, set))
332
333 elif this and this[0] in REPEAT_CHARS:
334 # repeat previous item
335 if this == "?":
336 min, max = 0, 1
337 elif this == "*":
338 min, max = 0, sys.maxint
339 elif this == "+":
340 min, max = 1, sys.maxint
341 elif this == "{":
342 min, max = 0, sys.maxint
343 lo = hi = ""
344 while str(source.next) in DIGITS:
345 lo = lo + source.get()
346 if source.match(","):
347 while str(source.next) in DIGITS:
348 hi = hi + source.get()
349 else:
350 hi = lo
351 if not source.match("}"):
352 raise SyntaxError, "bogus range"
353 if lo:
354 min = int(lo)
355 if hi:
356 max = int(hi)
357 # FIXME: <fl> check that hi >= lo!
358 else:
359 raise SyntaxError, "not supported"
360 # figure out which item to repeat
361 # FIXME: should back up to the right mark, right?
362 if subpattern:
363 index = len(subpattern)-1
364 while subpattern[index][0] is MARK:
365 index = index - 1
366 item = subpattern[index:index+1]
367 else:
368 raise SyntaxError, "nothing to repeat"
369 if source.match("?"):
370 subpattern[index] = (MIN_REPEAT, (min, max, item))
371 else:
372 subpattern[index] = (MAX_REPEAT, (min, max, item))
373 elif this == ".":
374 subpattern.append((ANY, None))
375 elif this == "(":
376 group = 1
377 name = None
378 if source.match("?"):
379 group = 0
380 # options
381 if source.match("P"):
382 # named group: skip forward to end of name
383 if source.match("<"):
384 name = ""
385 while 1:
386 char = source.get()
387 if char is None or char == ">":
388 break
389 name = name + char
390 group = 1
391 elif source.match(":"):
392 # non-capturing group
393 group = 2
394 elif source.match_set("iI"):
395 pattern.setflag("i")
396 elif source.match_set("lL"):
397 pattern.setflag("l")
398 elif source.match_set("mM"):
399 pattern.setflag("m")
400 elif source.match_set("sS"):
401 pattern.setflag("s")
402 elif source.match_set("xX"):
403 pattern.setflag("x")
404 if group:
405 # parse group contents
406 b = []
407 if group == 2:
408 # anonymous group
409 group = None
410 else:
411 group = pattern.getgroup(name)
412 if group:
413 subpattern.append((MARK, (group-1)*2))
414 while 1:
415 p = _parse(source, pattern, flags)
416 if source.match(")"):
417 if b:
418 b.append(p)
419 _branch(subpattern, b)
420 else:
421 subpattern.append((SUBPATTERN, (group, p)))
422 break
423 elif source.match("|"):
424 b.append(p)
425 else:
426 raise SyntaxError, "group not properly closed"
427 if group:
428 subpattern.append((MARK, (group-1)*2+1))
429 else:
430 # FIXME: should this really be a while loop?
431 while 1:
432 char = source.get()
433 if char is None or char == ")":
434 break
435
436 elif this == "^":
437 subpattern.append((AT, AT_BEGINNING))
438
439 elif this == "$":
440 subpattern.append((AT, AT_END))
441
442 elif this and this[0] == "\\":
443 code =_fixescape(this)
444 subpattern.append(code)
445
446 else:
447 raise SyntaxError, "parser error"
448
449 return subpattern
450
451def parse(source, flags=()):
452 s = Tokenizer(source)
453 g = Pattern()
454 b = []
455 while 1:
456 p = _parse(s, g, flags)
457 tail = s.get()
458 if tail == "|":
459 b.append(p)
460 elif tail == ")":
461 raise SyntaxError, "unbalanced parenthesis"
462 elif tail is None:
463 if b:
464 b.append(p)
465 p = SubPattern(g)
466 _branch(p, b)
467 break
468 else:
469 raise SyntaxError, "bogus characters at end of regular expression"
470 return p
471
472if __name__ == "__main__":
473 from pprint import pprint
474 from testpatterns import PATTERNS
475 a = b = c = 0
476 for pattern, flags in PATTERNS:
477 if flags:
478 continue
479 print "-"*68
480 try:
481 p = parse(pattern)
482 print repr(pattern), "->"
483 pprint(p.data)
484 import sre_compile
485 try:
486 code = sre_compile.compile(p)
487 c = c + 1
488 except:
489 pass
490 a = a + 1
491 except SyntaxError, v:
492 print "**", repr(pattern), v
493 b = b + 1
494 print "-"*68
495 print a, "of", b, "patterns successfully parsed"
496 print c, "of", b, "patterns successfully compiled"
497