blob: d3dbe00041e15166ac11332cfe7ca52e33b0450d [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +00004# convert re-style regular expression to sre pattern
Guido van Rossum7627c0d2000-03-31 14:58:54 +00005#
6# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossum7627c0d2000-03-31 14:58:54 +00008# Portions of this engine have been developed in cooperation with
9# CNRI. Hewlett-Packard provided funding for 1.6 integration and
10# other compatibility work.
11#
12
Guido van Rossum7627c0d2000-03-31 14:58:54 +000013import string, sys
14
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +000015import _sre
16
Guido van Rossum7627c0d2000-03-31 14:58:54 +000017from sre_constants import *
18
# FIXME: should be 65535, but the arraymodule is still broken
MAXREPEAT = 32767

# characters that start something other than a plain literal
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that start a repeat operator
REPEAT_CHARS = "*+?{"

# the tables below are tuples, not strings, on purpose: the tokenizer
# also hands out two-character escape tokens, and "token in TABLE" must
# compare against whole entries instead of doing a string search
DIGITS = tuple(string.digits)

OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")

# characters skipped by the parser when SRE_FLAG_VERBOSE is set
WHITESPACE = tuple(string.whitespace)
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +000031
# escape tokens that stand for one literal character.  these are
# consulted before CATEGORIES inside a character class and after it
# everywhere else, which is what decides the meaning of "\b"
ESCAPES = {
    "\\a": (LITERAL, "\007"),
    "\\b": (LITERAL, "\010"),
    "\\f": (LITERAL, "\014"),
    "\\n": (LITERAL, "\012"),
    "\\r": (LITERAL, "\015"),
    "\\t": (LITERAL, "\011"),
    "\\v": (LITERAL, "\013"),
    "\\\\": (LITERAL, "\\")
}

# escape tokens that stand for a position test or a character category
CATEGORIES = {
    "\\A": (AT, AT_BEGINNING), # start of string
    "\\b": (AT, AT_BOUNDARY),
    "\\B": (AT, AT_NON_BOUNDARY),
    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    "\\Z": (AT, AT_END), # end of string
}

# letters accepted in an inline "(?...)" flag group
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
67
class State:
    # bookkeeping shared by all subpatterns of one expression:
    #   flags     -- flag bits collected from inline "(?...)" groups
    #   groups    -- the index the next group will receive (starts at 1)
    #   groupdict -- maps group names to group indexes
    def __init__(self):
        self.flags = 0
        self.groups = 1
        self.groupdict = {}
    def getgroup(self, name=None):
        # hand out the next free group index, remembering the name
        # (if one was given) so it can be looked up later
        allocated = self.groups
        if name:
            self.groupdict[name] = allocated
        self.groups = allocated + 1
        return allocated
Guido van Rossum7627c0d2000-03-31 14:58:54 +000079
class SubPattern:
    # a subpattern, in intermediate form: a list of (opcode, argument)
    # tuples, plus the state object the subpattern was created with
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if not data:
            data = []
        self.data = data
        self.width = None # (min, max) once getwidth() has been called
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
        return len(self.data)
    def __delitem__(self, index):
        del self.data[index]
    def __getitem__(self, index):
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def __getslice__(self, start, stop):
        # slices are subpatterns too, sharing the same state object
        return SubPattern(self.pattern, self.data[start:stop])
    def insert(self, index, code):
        self.data.insert(index, code)
    def append(self, code):
        self.data.append(code)
    def getwidth(self):
        # determine the width (min, max) for this subpattern.  both
        # values are capped at sys.maxint, and the result is cached.
        # operators not listed below add nothing to either bound.
        if self.width:
            return self.width
        # accumulate as longs, so that large repeat counts cannot
        # overflow before the final capping
        lo = hi = long(0)
        for op, av in self.data:
            if op is BRANCH:
                # a branch is as narrow as its narrowest alternative
                # and as wide as its widest one
                i = sys.maxint
                j = 0
                for alternative in av[1]:
                    l, h = alternative.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in (MIN_REPEAT, MAX_REPEAT):
                # av is (min count, max count, repeated item)
                i, j = av[2].getwidth()
                lo = lo + long(i) * av[0]
                hi = hi + long(j) * av[1]
            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
                # each of these consumes exactly one character
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
        return self.width
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000138
class Tokenizer:
    # splits a pattern string into tokens with one token of lookahead.
    # a token is a single character, or a backslash together with the
    # character after it; "next" holds the upcoming token, or None
    # once the string is used up.
    def __init__(self, string):
        self.index = 0
        self.string = string
        self.next = self.__next()
    def __next(self):
        # cut the token at the current position and step past it
        pos = self.index
        text = self.string
        if pos >= len(text):
            return None
        token = text[pos]
        if token[0] == "\\":
            # an escape always takes the following character with it
            try:
                token = token + text[pos + 1]
            except IndexError:
                raise error("bogus escape")
        self.index = pos + len(token)
        return token
    def match(self, char):
        # consume the upcoming token if it equals char
        if char != self.next:
            return 0
        self.next = self.__next()
        return 1
    def match_set(self, set):
        # consume the upcoming token if it is a member of set
        if not self.next or self.next not in set:
            return 0
        self.next = self.__next()
        return 1
    def get(self):
        # return the upcoming token (None at the end) and advance
        token = self.next
        self.next = self.__next()
        return token
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000170
def isident(char):
    # true for an ascii letter or an underscore
    lower = "a" <= char <= "z"
    upper = "A" <= char <= "Z"
    return lower or upper or char == "_"
173
def isdigit(char):
    # true for an ascii decimal digit
    return char >= "0" and char <= "9"
176
def isname(name):
    # check that group name is a valid string: an identifier character
    # first, then identifier characters or digits.  returns 1 or 0.
    # FIXME: <fl> this code is really lame.  should use a regular
    # expression instead, but I seem to have certain bootstrapping
    # problems here ;-)
    if not name:
        # an empty name, as in "(?P<>...)", has no first character to
        # look at; report it as invalid instead of failing on name[0]
        return 0
    if not isident(name[0]):
        return 0
    for char in name:
        if not isident(char) and not isdigit(char):
            return 0
    return 1
188
Fredrik Lundh01016fe2000-06-30 00:27:46 +0000189def _group(escape, groups):
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000190 # check if the escape string represents a valid group
191 try:
Fredrik Lundhb71624e2000-06-30 09:13:06 +0000192 gid = int(escape[1:])
193 if gid and gid < groups:
194 return gid
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000195 except ValueError:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000196 pass
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000197 return None # not a valid group
198
def _class_escape(source, escape):
    # handle escape code inside character class.  "escape" is the
    # two-character token already read; more digits may be pulled from
    # "source".  returns an (opcode, argument) tuple.
    # the literal table is consulted first, the category table second
    for table in (ESCAPES, CATEGORIES):
        code = table.get(escape)
        if code:
            return code
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape: take every hex digit that follows
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            # FIXME: support unicode characters!
            return LITERAL, chr(int(escape[-4:], 16) & 0xff)
        if str(kind) in OCTDIGITS:
            # octal escape: take every octal digit that follows
            while source.next in OCTDIGITS:
                escape = escape + source.get()
            escape = escape[1:]
            # FIXME: support unicode characters!
            return LITERAL, chr(int(escape[-6:], 8) & 0xff)
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, escape[1]
    except ValueError:
        # no usable digits; reported below
        pass
    raise error("bogus escape: %s" % repr(escape))
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000225
def _escape(source, escape, state):
    # handle escape code in expression.  "escape" is the two-character
    # token already read; more digits may be pulled from "source".
    # returns an (opcode, argument) tuple.
    # outside a class the category table is consulted first
    for table in (CATEGORIES, ESCAPES):
        code = table.get(escape)
        if code:
            return code
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape: take every hex digit that follows
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            # FIXME: support unicode characters!
            return LITERAL, chr(int(escape[-4:], 16) & 0xff)
        if kind in DIGITS:
            # either a group reference or an octal escape
            while 1:
                gid = _group(escape, state.groups)
                if not gid and source.next not in OCTDIGITS:
                    break
                if gid:
                    # settle for this group unless one more token
                    # would still name a valid group
                    following = source.next
                    if (not following or
                        not _group(escape + following, state.groups)):
                        return GROUP, gid
                escape = escape + source.get()
            escape = escape[1:]
            # FIXME: support unicode characters!
            return LITERAL, chr(int(escape[-6:], 8) & 0xff)
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, escape[1]
    except ValueError:
        # no usable digits; reported below
        pass
    raise error("bogus escape: %s" % repr(escape))
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000261
262
def _branch(pattern, items):

    # form a branch operator from a set of items.  "items" is a list
    # of subpatterns (the alternatives); it is modified in place when
    # a common prefix is factored out.

    subpattern = SubPattern(pattern)

    # as long as every alternative starts with the same operator,
    # move that operator in front of the branch
    while 1:
        prefix = None
        shared = 1
        for item in items:
            if not item:
                shared = 0
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                shared = 0
                break
        if not shared:
            break
        for item in items:
            del item[0]
        subpattern.append(prefix)

    # alternatives that are all single literals can be stored as a
    # character set instead of a branch (FIXME: use a range if possible)
    only_literals = 1
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            only_literals = 0
            break

    if only_literals:
        charset = []
        for item in items:
            charset.append(item[0])
        subpattern.append((IN, charset))
    else:
        subpattern.append((BRANCH, (None, items)))
    return subpattern
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000303
def _parse(source, state, flags=0):

    # parse regular expression pattern into an operator list.
    #
    # reads tokens from "source" until the end of the pattern or an
    # unconsumed "|" or ")", and returns the operators as a SubPattern.
    # "state" hands out group numbers and collects inline flags;
    # "flags" is only passed on to nested calls.  raises error for
    # malformed input.

    subpattern = SubPattern(state)

    while 1:

        if source.next in ("|", ")"):
            break # end of subpattern
        this = source.get()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = source.get()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpattern.append((LITERAL, this))

        elif this == "[":
            # character set
            charset = []
##          if source.match(":"):
##              pass # handle character classes
            if source.match("^"):
                charset.append((NEGATE, None))
            # check remaining characters
            start = charset[:]
            while 1:
                this = source.get()
                if this == "]" and charset != start:
                    # a "]" in first position is an ordinary member
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, this
                else:
                    raise error("unexpected end of regular expression")
                if source.match("-"):
                    # potential range
                    this = source.get()
                    if this is None:
                        # pattern ended right after the "-"
                        raise error("unexpected end of regular expression")
                    if this == "]":
                        # trailing "-" is a literal; a category escape
                        # is stored unwrapped, as in the plain case
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        charset.append(code1)
                        charset.append((LITERAL, "-"))
                        break
                    else:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, this
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("illegal range")
                        if len(code1[1]) != 1 or len(code2[1]) != 1:
                            raise error("illegal range")
                        charset.append((RANGE, (code1[1], code2[1])))
                else:
                    if code1[0] is IN:
                        # category escapes come wrapped in (IN, [...])
                        code1 = code1[1][0]
                    charset.append(code1)

            # FIXME: <fl> move set optimization to compiler!
            if len(charset) == 1 and charset[0][0] is LITERAL:
                subpattern.append(charset[0]) # optimization
            elif (len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpattern.append((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # FIXME: <fl> add charmap optimization
                subpattern.append((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                mincount, maxcount = 0, 1
            elif this == "*":
                mincount, maxcount = 0, MAXREPEAT
            elif this == "+":
                mincount, maxcount = 1, MAXREPEAT
            elif this == "{":
                # {m}, {m,}, {,n} or {m,n}; a missing bound keeps
                # its default
                mincount, maxcount = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if source.match(","):
                    while source.next in DIGITS:
                        hi = hi + source.get()
                else:
                    hi = lo
                if not source.match("}"):
                    raise error("bogus range")
                if lo:
                    mincount = int(lo)
                if hi:
                    maxcount = int(hi)
                if maxcount < mincount:
                    raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                raise error("nothing to repeat")
            # a trailing "?" selects the minimizing variant
            if source.match("?"):
                subpattern[-1] = (MIN_REPEAT, (mincount, maxcount, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (mincount, maxcount, item))

        elif this == ".":
            subpattern.append((ANY, None))

        elif this == "(":
            # group: 1 = numbered group, 2 = non-capturing group,
            # 0 = no group contents to parse
            group = 1
            name = None
            if source.match("?"):
                group = 0
                # options
                if source.match("P"):
                    # python extensions
                    if source.match("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
                            raise error("illegal character in group name")
                    elif source.match("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise error("unterminated name")
                            if char == ")":
                                break
                            name = name + char
                        if not isname(name):
                            raise error("illegal character in group name")
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpattern.append((GROUP, gid))
                        # the closing ")" has been consumed above, so
                        # the construct is complete; going on to the
                        # code below would eat the following token
                        continue
                    else:
                        char = source.get()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif source.match(":"):
                    # non-capturing group
                    group = 2
                elif source.match("#"):
                    # comment: skip up to (not including) the ")"
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        source.get()
                elif source.next in ("=", "!"):
                    # lookahead assertions
                    char = source.get()
                    b = []
                    while 1:
                        p = _parse(source, state, flags)
                        if source.next == ")":
                            # the ")" itself is consumed further down
                            if b:
                                b.append(p)
                                p = _branch(state, b)
                            if char == "=":
                                subpattern.append((ASSERT, p))
                            else:
                                subpattern.append((ASSERT_NOT, p))
                            break
                        elif source.match("|"):
                            b.append(p)
                        else:
                            raise error("pattern not properly closed")
                else:
                    # flags
                    while FLAGS.has_key(source.next):
                        state.flags = state.flags | FLAGS[source.get()]
            if group:
                # parse group contents
                b = []
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.getgroup(name)
                while 1:
                    p = _parse(source, state, flags)
                    if source.match(")"):
                        if b:
                            b.append(p)
                            p = _branch(state, b)
                        subpattern.append((SUBPATTERN, (group, p)))
                        break
                    elif source.match("|"):
                        b.append(p)
                    else:
                        raise error("group not properly closed")
            else:
                # comments, flags and lookaheads end up here: only
                # the closing ")" may follow
                while 1:
                    char = source.get()
                    if char is None or char == ")":
                        break
                    raise error("unknown extension")

        elif this == "^":
            subpattern.append((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpattern.append(code)

        else:
            raise error("parser error")

    return subpattern
537
def parse(pattern, flags=0):
    # parse 're' pattern into list of (opcode, argument) tuples.
    # returns the resulting SubPattern; raises error if the pattern
    # is malformed.
    source = Tokenizer(pattern)
    state = State()
    # make the caller's flags visible to the parser, which looks at
    # state.flags (not at its "flags" argument) to decide whether to
    # skip whitespace and comments; inline flags are or'ed in later
    state.flags = flags
    b = []
    while 1:
        p = _parse(source, state, flags)
        # _parse stops in front of "|" and ")", or at the very end
        tail = source.get()
        if tail == "|":
            b.append(p)
        elif tail == ")":
            raise error("unbalanced parenthesis")
        elif tail is None:
            if b:
                # top-level alternatives: combine them into a branch
                b.append(p)
                p = _branch(state, b)
            break
        else:
            raise error("bogus characters at end of regular expression")
    return p
558
def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references.  "source" is the replacement string; "pattern"
    # supplies "groups" and "groupindex".  returns a list of
    # (LITERAL, string) and (MARK, group index) tuples.  raises error
    # for malformed escapes, IndexError for an unknown group name.
    s = Tokenizer(source)
    p = []
    a = p.append
    while 1:
        this = s.get()
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
            # group
            if this == "\\g":
                # \g<number> or \g<name>
                name = ""
                if s.match("<"):
                    while 1:
                        char = s.get()
                        if char is None:
                            raise error("unterminated group name")
                        if char == ">":
                            break
                        name = name + char
                if not name:
                    raise error("bad group name")
                try:
                    index = int(name)
                except ValueError:
                    if not isname(name):
                        raise error("illegal character in group name")
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name")
                a((MARK, index))
            elif len(this) > 1 and this[1] in DIGITS:
                # either a group reference or an octal escape
                code = None
                while 1:
                    group = _group(this, pattern.groups+1)
                    if group:
                        # settle for this group unless one more token
                        # would still name a valid group
                        if (not s.next or
                            not _group(this + s.next, pattern.groups+1)):
                            code = MARK, int(group)
                            break
                    elif s.next in OCTDIGITS:
                        this = this + s.get()
                    else:
                        break
                if not code:
                    # FIXME: support unicode characters!
                    try:
                        code = LITERAL, chr(int(this[1:][-6:], 8) & 0xff)
                    except ValueError:
                        # "\8" or "\9": neither a group nor octal.
                        # report it the way _escape does, instead of
                        # leaking the ValueError
                        raise error("bogus escape: %s" % repr(this))
                a(code)
            else:
                try:
                    a(ESCAPES[this])
                except KeyError:
                    # unknown escape: keep backslash and character
                    for c in this:
                        a((LITERAL, c))
        else:
            a((LITERAL, this))
    return p
620
def expand_template(template, match):
    # build the replacement text for one match from a template made
    # of (LITERAL, string) and (MARK, group) tuples.  raises error if
    # a referenced group is None.
    # FIXME: <fl> this is sooooo slow.  drop in the slicelist
    # code instead
    pieces = []
    for code, value in template:
        if code is LITERAL:
            pieces.append(value)
        elif code is MARK:
            text = match.group(value)
            if text is None:
                raise error("empty group")
            pieces.append(text)
    # join with an empty string sliced from the subject string
    return match.string[:0].join(pieces)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000634 return match.string[:0].join(p)