blob: 8e6705c08772ee9be6c58cefbea11dbc811b9aa5 [file] [log] [blame]
#
# Secret Labs' Regular Expression Engine
# $Id$
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
#
# This code can only be used for 1.6 alpha testing.  All other use
# require explicit permission from Secret Labs AB.
#
# Portions of this engine have been developed in cooperation with
# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
16
Guido van Rossum7627c0d2000-03-31 14:58:54 +000017import string, sys
18
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +000019import _sre
20
Guido van Rossum7627c0d2000-03-31 14:58:54 +000021from sre_constants import *
22
# Repeat count used by the parser for "*", "+" and for "{m,}" without
# an upper bound.  It is derived from the value reported by
# _sre.getcodesize() (presumably the size of one code word in bytes --
# TODO confirm), minus one bit.
# FIXME: should be 65535, but the array module currently chokes on
# unsigned integers larger than 32767...
MAXREPEAT = int(2L**(_sre.getcodesize()*8-1))-1

# Characters that are not plain literals at the top level of a pattern,
# and the subset of them that starts a repeat qualifier.
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# Single-character tuples used for membership tests on tokens.
# FIXME: string in tuple tests may explode with if char is unicode :-(
DIGITS = tuple(string.digits)

OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")

WHITESPACE = tuple(string.whitespace)

# Escapes that stand for one fixed character, keyed by the two-character
# token (backslash + letter).  Note that "\\b" is also a key of
# CATEGORIES below; which meaning applies depends on which table the
# caller consults first.
ESCAPES = {
    "\\a": (LITERAL, chr(7)),
    "\\b": (LITERAL, chr(8)),
    "\\f": (LITERAL, chr(12)),
    "\\n": (LITERAL, chr(10)),
    "\\r": (LITERAL, chr(13)),
    "\\t": (LITERAL, chr(9)),
    "\\v": (LITERAL, chr(11))
}

# Escapes that map to position assertions (AT ...) or to character
# categories wrapped in a one-element IN set.
CATEGORIES = {
    "\\A": (AT, AT_BEGINNING), # start of string
    "\\b": (AT, AT_BOUNDARY),
    "\\B": (AT, AT_NON_BOUNDARY),
    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    "\\Z": (AT, AT_END), # end of string
}

# Letters accepted in an inline "(?...)" flag group and the flag bit
# each one switches on.
FLAGS = {
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "t": SRE_FLAG_TEMPLATE,
    "x": SRE_FLAG_VERBOSE,
}
69
class State:
    # Bookkeeping shared by one parse run: the flag bits collected so
    # far, the next free group number, and the name -> number mapping
    # for named groups.
    def __init__(self):
        self.groupdict = {}
        self.groups = 1      # group numbers are handed out starting at 1
        self.flags = 0

    def getgroup(self, name=None):
        # Reserve the next group number and return it.  When a
        # non-empty name is given, remember it in groupdict as well.
        allocated = self.groups
        self.groups = allocated + 1
        if name:
            self.groupdict[name] = allocated
        return allocated
Guido van Rossum7627c0d2000-03-31 14:58:54 +000081
class SubPattern:
    # A subpattern in intermediate form: a thin wrapper around a list
    # of (opcode, argument) tuples.
    #
    #   pattern -- the object passed in by the creator (the parser
    #              passes its State instance)
    #   data    -- the list of (opcode, argument) tuples
    #   width   -- cached result of getwidth(), or None
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if not data:
            data = []
        self.data = data
        self.width = None

    def __repr__(self):
        return repr(self.data)

    # sequence protocol, delegated to the underlying list

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        return self.data[index]

    def __setitem__(self, index, code):
        self.data[index] = code

    def __getslice__(self, start, stop):
        # a slice is a new SubPattern sharing the same pattern object
        return SubPattern(self.pattern, self.data[start:stop])

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        # Determine the width (min, max) for this subpattern, i.e. the
        # smallest and largest number of characters it can match.  Both
        # values are clamped to sys.maxint.  The result is cached in
        # self.width.
        if self.width:
            return self.width
        # accumulate in longs so that large repeat counts cannot
        # overflow before the final clamp
        lo = hi = long(0)
        for op, av in self.data:
            if op is BRANCH:
                # a branch is as narrow as its narrowest alternative
                # and as wide as its widest one
                narrowest = sys.maxint
                widest = 0
                for alternative in av[1]:
                    i, j = alternative.getwidth()
                    narrowest = min(narrowest, i)
                    widest = max(widest, j)
                lo = lo + narrowest
                hi = hi + widest
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in (MIN_REPEAT, MAX_REPEAT):
                # av is (min count, max count, repeated item)
                i, j = av[2].getwidth()
                lo = lo + long(i) * av[0]
                hi = hi + long(j) * av[1]
            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
                # these always consume exactly one character
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
            # any other opcode contributes no width
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
        return self.width
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000140
class Tokenizer:
    # Splits a pattern string into tokens with one token of lookahead.
    # A token is a single character, or a backslash together with the
    # character that follows it.  `next` always holds the upcoming
    # token, or None once the string is exhausted.
    def __init__(self, string):
        self.index = 0
        self.string = string
        self.next = self.__next()

    def __next(self):
        # Read the token at the current index and advance past it.
        # Raises SyntaxError if the string ends right after a backslash.
        text = self.string
        start = self.index
        if start >= len(text):
            return None
        token = text[start]
        if token[0] == "\\":
            try:
                token = token + text[start + 1]
            except IndexError:
                raise SyntaxError("bogus escape")
        self.index = start + len(token)
        return token

    def match(self, char):
        # Consume the upcoming token if it equals char; return 1 if it
        # was consumed, 0 otherwise.
        if char != self.next:
            return 0
        self.next = self.__next()
        return 1

    def match_set(self, set):
        # Consume the upcoming token if it is a member of set; return 1
        # if it was consumed, 0 otherwise (also at end of input).
        upcoming = self.next
        if not upcoming or upcoming not in set:
            return 0
        self.next = self.__next()
        return 1

    def get(self):
        # Return the upcoming token (None at end of input) and advance.
        current = self.next
        self.next = self.__next()
        return current
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000172
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000173def _group(escape, state):
174 # check if the escape string represents a valid group
175 try:
176 group = int(escape[1:])
177 if group and group < state.groups:
178 return group
179 except ValueError:
180 pass
181 return None # not a valid group
182
def _class_escape(source, escape):
    # Handle an escape code found inside a character class.  `escape`
    # is the token as delivered by the tokenizer (backslash plus one
    # character); further hex or octal digits are pulled from `source`.
    # Returns an (opcode, argument) tuple; raises SyntaxError if the
    # escape cannot be interpreted.
    for table in (ESCAPES, CATEGORIES):
        code = table.get(escape)
        if code:
            return code
    try:
        if escape[1:2] == "x":
            # hexadecimal: absorb all following hex digits, keep at
            # most the last four and reduce the value to one byte
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            # FIXME: support unicode characters!
            value = int(escape[-4:], 16) & 0xff
            return LITERAL, chr(value)
        if str(escape[1:2]) in OCTDIGITS:
            # octal: absorb all following octal digits, keep at most
            # the last six and reduce the value to one byte
            while source.next in OCTDIGITS:
                escape = escape + source.get()
            escape = escape[1:]
            # FIXME: support unicode characters!
            value = int(escape[-6:], 8) & 0xff
            return LITERAL, chr(value)
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, escape[1]
    except ValueError:
        pass
    raise SyntaxError("bogus escape: %s" % repr(escape))
209
def _escape(source, escape, state):
    # Handle an escape code in an expression (outside a character
    # class).  `escape` is the token delivered by the tokenizer;
    # further digits are pulled from `source`, and `state` is consulted
    # to tell group references from octal escapes.  Returns an
    # (opcode, argument) tuple; raises SyntaxError if the escape cannot
    # be interpreted.
    for table in (CATEGORIES, ESCAPES):
        code = table.get(escape)
        if code:
            return code
    try:
        if escape[1:2] == "x":
            # hexadecimal: absorb all following hex digits, keep at
            # most the last four and reduce the value to one byte
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            # FIXME: support unicode characters!
            value = int(escape[-4:], 16) & 0xff
            return LITERAL, chr(value)
        if str(escape[1:2]) in DIGITS:
            while 1:
                group = _group(escape, state)
                if group:
                    # a valid group so far; only keep reading if one
                    # more token still yields a valid group number
                    upcoming = source.next
                    if upcoming and _group(escape + upcoming, state):
                        escape = escape + source.get()
                        continue
                    return GROUP, group
                if source.next in OCTDIGITS:
                    escape = escape + source.get()
                    continue
                break
            # not a group reference: treat the digits as octal
            escape = escape[1:]
            # FIXME: support unicode characters!
            value = int(escape[-6:], 8) & 0xff
            return LITERAL, chr(value)
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, escape[1]
    except ValueError:
        pass
    raise SyntaxError("bogus escape: %s" % repr(escape))
245
246
def _branch(pattern, items):

    # Form a branch operator from a set of items, i.e. the alternatives
    # of an "a|b|c" construct.  The items are modified in place while
    # common prefixes are moved out.  Returns a new SubPattern.
    # (FIXME: move this optimization to the compiler module!)

    result = SubPattern(pattern)

    # as long as every alternative starts with the same element, move
    # that element out in front of the branch
    shared = 1
    while shared:
        head = None
        for alternative in items:
            if not alternative:
                shared = 0
                break
            if head is None:
                head = alternative[0]
            elif alternative[0] != head:
                shared = 0
                break
        if shared:
            for alternative in items:
                del alternative[0]
            result.append(head)

    # check if the branch can be replaced by a character set: that is
    # the case when each alternative is exactly one literal
    only_literals = 1
    for alternative in items:
        if len(alternative) != 1 or alternative[0][0] != LITERAL:
            only_literals = 0
            break

    if only_literals:
        # (FIXME: use a range if possible)
        charset = []
        for alternative in items:
            charset.append(alternative[0])
        result.append((IN, charset))
    else:
        result.append((BRANCH, (None, items)))
    return result
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000288
def _parse(source, state, flags=0):

    # Parse regular expression pattern into an operator list.
    #
    # Reads tokens from `source` (a Tokenizer) until the end of the
    # pattern or until an unconsumed "|" or ")" is next, and returns
    # the items parsed so far as a SubPattern.  `state` supplies group
    # numbering and the flag bits (inline "(?...)" flags are stored
    # there).  `flags` is only handed on to recursive calls.
    # Raises SyntaxError on malformed input.

    subpattern = SubPattern(state)

    while 1:

        if str(source.next) in ("|", ")"):
            break # end of subpattern
        this = source.get()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = source.get()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpattern.append((LITERAL, this))

        elif this == "[":
            # character set
            charset = []
            if source.match("^"):
                charset.append((NEGATE, None))
            # check remaining characters; a "]" only closes the set
            # once something has been added after the optional "^"
            start = charset[:]
            while 1:
                this = source.get()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, this
                else:
                    raise SyntaxError("unexpected end of regular expression")
                if source.match("-"):
                    # potential range
                    this = source.get()
                    if this is None:
                        # pattern ended right after the "-"
                        raise SyntaxError(
                            "unexpected end of regular expression")
                    if this == "]":
                        # trailing "-" is a literal dash
                        charset.append(code1)
                        charset.append((LITERAL, "-"))
                        break
                    else:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, this
                        # both ends must be single literal characters
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise SyntaxError("illegal range")
                        if len(code1[1]) != 1 or len(code2[1]) != 1:
                            raise SyntaxError("illegal range")
                        charset.append((RANGE, (code1[1], code2[1])))
                else:
                    if code1[0] is IN:
                        # unwrap a category escape such as \d
                        code1 = code1[1][0]
                    charset.append(code1)

            # FIXME: <fl> move set optimization to compiler!
            if len(charset) == 1 and charset[0][0] is LITERAL:
                subpattern.append(charset[0]) # optimization
            elif (len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpattern.append((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # FIXME: <fl> add charmap optimization
                subpattern.append((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                lower, upper = 0, 1
            elif this == "*":
                lower, upper = 0, MAXREPEAT
            elif this == "+":
                lower, upper = 1, MAXREPEAT
            elif this == "{":
                lower, upper = 0, MAXREPEAT
                lo = hi = ""
                while str(source.next) in DIGITS:
                    lo = lo + source.get()
                if source.match(","):
                    while str(source.next) in DIGITS:
                        hi = hi + source.get()
                else:
                    # "{n}" means exactly n
                    hi = lo
                if not source.match("}"):
                    raise SyntaxError("bogus range")
                if lo:
                    lower = int(lo)
                if hi:
                    upper = int(hi)
                if upper < lower:
                    raise SyntaxError("bad repeat interval")
            else:
                raise SyntaxError("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                raise SyntaxError("nothing to repeat")
            # a "?" right after the qualifier selects the minimizing form
            if source.match("?"):
                subpattern[-1] = (MIN_REPEAT, (lower, upper, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (lower, upper, item))

        elif this == ".":
            subpattern.append((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = non-capturing, 0 = no group
            # contents to parse (flags)
            group = 1
            name = None
            if source.match("?"):
                group = 0
                # options
                if source.match("P"):
                    # python extensions
                    if source.match("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise SyntaxError("unterminated name")
                            if char == ">":
                                break
                            # FIXME: check for valid character
                            name = name + char
                        group = 1
                    elif source.match("="):
                        # named backreference
                        raise SyntaxError("not yet implemented")
                    else:
                        char = source.get()
                        if char is None:
                            raise SyntaxError("unexpected end of pattern")
                        raise SyntaxError("unknown specifier: ?P%s" % char)
                elif source.match(":"):
                    # non-capturing group
                    group = 2
                elif source.match("#"):
                    # comment: discard everything up to and including
                    # the closing ")"
                    while 1:
                        char = source.get()
                        if char is None or char == ")":
                            break
                    # the ")" is already consumed, so go straight to
                    # the next token; falling through to the skip loop
                    # below would swallow the pattern up to the next ")"
                    continue
                else:
                    # flags
                    while FLAGS.has_key(source.next):
                        state.flags = state.flags | FLAGS[source.get()]
            if group:
                # parse group contents
                b = []
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.getgroup(name)
                while 1:
                    p = _parse(source, state, flags)
                    if source.match(")"):
                        if b:
                            b.append(p)
                            p = _branch(state, b)
                        subpattern.append((SUBPATTERN, (group, p)))
                        break
                    elif source.match("|"):
                        b.append(p)
                    else:
                        raise SyntaxError("group not properly closed")
            else:
                # flag group: skip to the closing ")"
                while 1:
                    char = source.get()
                    if char is None or char == ")":
                        break
                    # FIXME: skip characters?

        elif this == "^":
            subpattern.append((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpattern.append(code)

        else:
            raise SyntaxError("parser error")

    return subpattern
490
def parse(pattern, flags=0):
    # Parse 're' pattern into list of (opcode, argument) tuples.
    #
    # `pattern` is the pattern string and `flags` the initial flag
    # bits.  Returns a SubPattern whose `pattern` attribute is the
    # State object used for the parse.  Raises SyntaxError on
    # malformed input.
    source = Tokenizer(pattern)
    state = State()
    # _parse reads its flags (e.g. SRE_FLAG_VERBOSE) from the state, so
    # the caller's flags have to be stored there to take effect
    state.flags = flags
    b = []
    while 1:
        p = _parse(source, state, flags)
        tail = source.get()
        if tail == "|":
            # top-level alternative; collect it and parse the next one
            b.append(p)
        elif tail == ")":
            raise SyntaxError("unbalanced parenthesis")
        elif tail is None:
            if b:
                b.append(p)
                p = _branch(state, b)
            break
        else:
            raise SyntaxError(
                "bogus characters at end of regular expression")
    return p
511
def parse_replacement(source, pattern):
    # Parse 're' replacement string into a list of (LITERAL, char)
    # tuples.
    #
    # `source` is the replacement string; `pattern` is accepted but not
    # used here.  Escapes listed in ESCAPES are replaced by the
    # character they stand for.  Any other escape (group references
    # included, which are not interpreted yet) is passed through as two
    # literals, the backslash and the following character.
    s = Tokenizer(source)
    p = []
    a = p.append
    while 1:
        this = s.get()
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
            code = ESCAPES.get(this)
            if code:
                # ESCAPES values already are (LITERAL, char) tuples
                a(code)
            else:
                for char in this:
                    a((LITERAL, char))
        else:
            a((LITERAL, this))
    return p
531
if __name__ == "__main__":
    # Self-test: parse (and try to compile) every flag-less pattern
    # from the testpatterns module and print a summary.
    from pprint import pprint
    from testpatterns import PATTERNS
    import sre_compile
    a = b = c = 0 # parsed ok / failed to parse / compiled ok
    for pattern, flags in PATTERNS:
        if flags:
            continue
        print("-"*68)
        try:
            p = parse(pattern)
            print("%s ->" % repr(pattern))
            pprint(p.data)
            try:
                code = sre_compile.compile(p)
                c = c + 1
            except Exception:
                # best effort: a compile failure only affects the count
                pass
            a = a + 1
        except SyntaxError:
            v = sys.exc_info()[1]
            print("** %s %s" % (repr(pattern), v))
            b = b + 1
    print("-"*68)
    # report against the number of patterns tried, not the failures
    total = a + b
    print("%d of %d patterns successfully parsed" % (a, total))
    print("%d of %d patterns successfully compiled" % (c, total))
557