blob: ec934fe6b49a563deff54585831c088f3acc324e [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +00004# convert re-style regular expression to sre pattern
Guido van Rossum7627c0d2000-03-31 14:58:54 +00005#
6# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
7#
Guido van Rossum7627c0d2000-03-31 14:58:54 +00008# Portions of this engine have been developed in cooperation with
9# CNRI. Hewlett-Packard provided funding for 1.6 integration and
10# other compatibility work.
11#
12
Guido van Rossum7627c0d2000-03-31 14:58:54 +000013import string, sys
14
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +000015import _sre
16
Guido van Rossum7627c0d2000-03-31 14:58:54 +000017from sre_constants import *
18
# upper bound used for open-ended repeats ("*", "+", "{n,}")
# FIXME: should be 65535, but the arraymodule is still broken
MAXREPEAT = 32767

# characters with a special meaning in a pattern; any token that does
# not start with one of these is parsed as a literal
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that start a repeat operator
REPEAT_CHARS = "*+?{"

# the character classes below are tuples, not strings, so that a
# membership test accepts any token the tokenizer can produce:
# single characters, two-character escapes, and None (end of input)
DIGITS = tuple(string.digits)

OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")

# stored as a tuple for the same reason as the digit classes; a plain
# string rejects None and cannot be relied on for multi-character
# operands
WHITESPACE = tuple(string.whitespace)
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +000031
# escapes that stand for one literal character, keyed by the
# two-character token produced by the tokenizer.  note that r"\b" is
# also a key in CATEGORIES: _class_escape consults this table first
# (backspace), _escape consults CATEGORIES first (word boundary)
ESCAPES = {
    r"\a": (LITERAL, chr(7)), # bell
    r"\b": (LITERAL, chr(8)), # backspace
    r"\f": (LITERAL, chr(12)), # form feed
    r"\n": (LITERAL, chr(10)), # line feed
    r"\r": (LITERAL, chr(13)), # carriage return
    r"\t": (LITERAL, chr(9)), # horizontal tab
    r"\v": (LITERAL, chr(11)), # vertical tab
    r"\\": (LITERAL, "\\") # the backslash itself
}
42
# escapes that stand for a position test (AT) or for a character
# category (wrapped in a one-element IN set), keyed by the
# two-character token produced by the tokenizer
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END), # end of string
}
55
# maps the flag letters accepted inside a "(?...)" group to the flag
# bits that _parse ORs into state.flags
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
67
class State:
    # bookkeeping shared by everything parsed from one pattern:
    #   flags     -- flag bits collected so far
    #   groups    -- number the next capturing group will receive
    #   groupdict -- maps group names to group numbers
    def __init__(self):
        self.groupdict = {}
        self.groups = 1
        self.flags = 0
    def getgroup(self, name=None):
        # hand out the next group number; when a (non-empty) name is
        # given, also remember it under that name
        number = self.groups
        self.groups = number + 1
        if not name:
            return number
        self.groupdict[name] = number
        return number
Guido van Rossum7627c0d2000-03-31 14:58:54 +000079
class SubPattern:
    # a subpattern, in intermediate form: a list of (opcode, argument)
    # tuples, plus a reference to the object describing the pattern it
    # belongs to (the parser passes its State instance)
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        # only create a list when none was supplied; an empty list
        # handed in by the caller must stay shared with the caller
        if data is None:
            data = []
        self.data = data
        self.width = None # cached result of getwidth()
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
        return len(self.data)
    def __delitem__(self, index):
        del self.data[index]
    def __getitem__(self, index):
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def __getslice__(self, start, stop):
        # slices are subpatterns too (over a copy of the sliced items)
        return SubPattern(self.pattern, self.data[start:stop])
    def insert(self, index, code):
        self.data.insert(index, code)
    def append(self, code):
        self.data.append(code)
    def getwidth(self):
        # determine the width (min, max) for this subpattern.  the
        # result is computed once and cached in self.width; both
        # values are capped at sys.maxint
        if self.width is not None:
            return self.width
        # accumulate in long integers so large repeat counts cannot
        # overflow before the final clamp
        lo = hi = long(0)
        for op, av in self.data:
            if op is BRANCH:
                # a branch is as narrow as its narrowest alternative
                # and as wide as its widest one
                l = sys.maxint
                h = 0
                for alternative in av[1]:
                    i, j = alternative.getwidth()
                    l = min(l, i)
                    h = max(h, j)
                lo = lo + l
                hi = hi + h
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in (MIN_REPEAT, MAX_REPEAT):
                # av is (min count, max count, repeated item)
                i, j = av[2].getwidth()
                lo = lo + long(i) * av[0]
                hi = hi + long(j) * av[1]
            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
                # each of these consumes exactly one character
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
        return self.width
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000138
class Tokenizer:
    # splits a string into tokens with one token of lookahead.  a
    # token is a single character, or a backslash together with the
    # character that follows it.  self.next holds the upcoming token,
    # or None once the input is exhausted
    def __init__(self, string):
        self.index = 0
        self.string = string
        self.next = self.__next()
    def __next(self):
        # read the token starting at self.index and advance past it
        position = self.index
        if position >= len(self.string):
            return None
        token = self.string[position]
        if token[0] == "\\":
            # an escape always takes the following character with it
            try:
                token = token + self.string[position + 1]
            except IndexError:
                raise error("bogus escape")
        self.index = position + len(token)
        return token
    def match(self, char):
        # consume the upcoming token if it equals char
        if char != self.next:
            return 0
        self.next = self.__next()
        return 1
    def match_set(self, set):
        # consume the upcoming token if it is a member of set
        if not self.next or self.next not in set:
            return 0
        self.next = self.__next()
        return 1
    def get(self):
        # return the upcoming token (None at end of input) and advance
        token = self.next
        self.next = self.__next()
        return token
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000170
def isident(char):
    # true if char can start a group name: "_" or a letter in the
    # ranges A-Z / a-z.  the range tests use plain string ordering,
    # so callers are expected to pass a single character
    return char == "_" or "A" <= char <= "Z" or "a" <= char <= "z"
173
def isdigit(char):
    # true if char lies in the range "0".."9" (plain string ordering,
    # so callers are expected to pass a single character)
    return char >= "0" and char <= "9"
176
def isname(name):
    # check that group name is a valid string: it must start with a
    # character accepted by isident, and every character must be
    # accepted by isident or isdigit.  returns 1 if valid, else 0
    # FIXME: <fl> this code is really lame.  should use a regular
    # expression instead, but I seem to have certain bootstrapping
    # problems here ;-)
    if not name:
        # an empty name (as in "(?P<>...)") has no first character to
        # inspect; report it as invalid instead of failing on name[0]
        return 0
    if not isident(name[0]):
        return 0
    for char in name:
        if not isident(char) and not isdigit(char):
            return 0
    return 1
188
Fredrik Lundh01016fe2000-06-30 00:27:46 +0000189def _group(escape, groups):
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000190 # check if the escape string represents a valid group
191 try:
192 group = int(escape[1:])
Fredrik Lundh01016fe2000-06-30 00:27:46 +0000193 if group and group < groups:
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000194 return group
195 except ValueError:
196 pass
197 return None # not a valid group
198
def _class_escape(source, escape):
    # handle escape code inside character class
    #
    # source -- tokenizer; further digits of a numeric escape are
    #     read from it
    # escape -- the escape token that was just read
    #
    # returns an (opcode, argument) tuple, raises error if the escape
    # cannot be interpreted.  ESCAPES is consulted before CATEGORIES
    for table in (ESCAPES, CATEGORIES):
        code = table.get(escape)
        if code:
            return code
    try:
        lead = escape[1:2]
        if lead == "x":
            # hexadecimal escape: take every hex digit that follows
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            # FIXME: support unicode characters!
            value = int(escape[-4:], 16)
            return LITERAL, chr(value & 0xff)
        if str(lead) in OCTDIGITS:
            # octal escape: take every octal digit that follows
            while source.next in OCTDIGITS:
                escape = escape + source.get()
            escape = escape[1:]
            # FIXME: support unicode characters!
            value = int(escape[-6:], 8)
            return LITERAL, chr(value & 0xff)
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, escape[1]
    except ValueError:
        # the digits did not form a number; reported below
        pass
    raise error("bogus escape: %s" % repr(escape))
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000225
def _escape(source, escape, state):
    # handle escape code in expression
    #
    # source -- tokenizer; further digits of a numeric escape are
    #     read from it
    # escape -- the escape token that was just read
    # state -- parser state; state.groups bounds valid group numbers
    #
    # returns an (opcode, argument) tuple, raises error if the escape
    # cannot be interpreted.  CATEGORIES is consulted before ESCAPES
    for table in (CATEGORIES, ESCAPES):
        code = table.get(escape)
        if code:
            return code
    try:
        lead = escape[1:2]
        if lead == "x":
            # hexadecimal escape: take every hex digit that follows
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            # FIXME: support unicode characters!
            value = int(escape[-4:], 16)
            return LITERAL, chr(value & 0xff)
        if lead in DIGITS:
            # either a group reference or an octal escape
            while 1:
                group = _group(escape, state.groups)
                if group:
                    # stop here unless one more token would still
                    # give a valid group number
                    upcoming = source.next
                    if not upcoming or not _group(escape + upcoming,
                                                  state.groups):
                        return GROUP, group
                elif source.next not in OCTDIGITS:
                    break
                escape = escape + source.get()
            escape = escape[1:]
            # FIXME: support unicode characters!
            value = int(escape[-6:], 8)
            return LITERAL, chr(value & 0xff)
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, escape[1]
    except ValueError:
        # the digits did not form a number; reported below
        pass
    raise error("bogus escape: %s" % repr(escape))
Andrew M. Kuchling815d5b92000-06-09 14:08:07 +0000261
262
def _branch(pattern, items):

    # form a branch operator from a set of items
    #
    # pattern -- object handed to the SubPattern that is returned
    # items -- list of alternatives (subpatterns); modified in place
    #     when a common prefix is moved out
    #
    # returns a new SubPattern holding the shared prefix followed by
    # either a character set or a BRANCH over the remaining items

    subpattern = SubPattern(pattern)

    # move leading elements shared by all items out of the branch,
    # one element per round
    while 1:
        prefix = None
        shared = 1
        for item in items:
            if not item:
                shared = 0
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                shared = 0
                break
        if not shared:
            break
        for item in items:
            del item[0]
        subpattern.append(prefix)

    # if every item is a single literal, a character set will do
    # (FIXME: use a range if possible)
    charset = []
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            # a real branch is required
            subpattern.append((BRANCH, (None, items)))
            return subpattern
        charset.append(item[0])

    subpattern.append((IN, charset))
    return subpattern
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000303
def _parse(source, state, flags=0):

    # parse regular expression pattern into an operator list.
    #
    # source -- tokenizer positioned at the start of the (sub)pattern
    # state -- parser state; group numbers are allocated from it and
    #     inline flags are ORed into state.flags
    # flags -- only forwarded to nested _parse calls
    #
    # returns a SubPattern.  parsing stops in front of a "|" or ")"
    # (which is left for the caller) or at the end of the input.
    # raises error for malformed patterns.

    subpattern = SubPattern(state)

    while 1:

        if source.next in ("|", ")"):
            break # end of subpattern
        this = source.get()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments.  escape tokens are two
            # characters long and never whitespace, so keep them out
            # of the membership test
            if len(this) == 1 and this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = source.get()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpattern.append((LITERAL, this))

        elif this == "[":
            # character set
            charset = []
##          if source.match(":"):
##              pass # handle character classes
            if source.match("^"):
                charset.append((NEGATE, None))
            # check remaining characters.  "]" only closes the set
            # once something has been added to it
            start = charset[:]
            while 1:
                this = source.get()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, this
                else:
                    raise error("unexpected end of regular expression")
                if source.match("-"):
                    # potential range
                    this = source.get()
                    if this is None:
                        # pattern ended right after the "-"
                        raise error("unexpected end of regular expression")
                    if this == "]":
                        # trailing "-" is a literal
                        charset.append(code1)
                        charset.append((LITERAL, "-"))
                        break
                    else:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, this
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("illegal range")
                        if len(code1[1]) != 1 or len(code2[1]) != 1:
                            raise error("illegal range")
                        charset.append((RANGE, (code1[1], code2[1])))
                else:
                    if code1[0] is IN:
                        # unwrap the category from its one-element set
                        code1 = code1[1][0]
                    charset.append(code1)

            # FIXME: <fl> move set optimization to compiler!
            if len(charset) == 1 and charset[0][0] is LITERAL:
                subpattern.append(charset[0]) # optimization
            elif (len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpattern.append((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # FIXME: <fl> add charmap optimization
                subpattern.append((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                mincount, maxcount = 0, 1
            elif this == "*":
                mincount, maxcount = 0, MAXREPEAT
            elif this == "+":
                mincount, maxcount = 1, MAXREPEAT
            elif this == "{":
                # {lo}, {lo,hi}, {lo,} or {,hi}; a missing bound keeps
                # its default
                mincount, maxcount = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if source.match(","):
                    while source.next in DIGITS:
                        hi = hi + source.get()
                else:
                    hi = lo
                if not source.match("}"):
                    raise error("bogus range")
                if lo:
                    mincount = int(lo)
                if hi:
                    maxcount = int(hi)
                if maxcount < mincount:
                    raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                raise error("nothing to repeat")
            # a trailing "?" selects the minimizing variant
            if source.match("?"):
                subpattern[-1] = (MIN_REPEAT, (mincount, maxcount, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (mincount, maxcount, item))

        elif this == ".":
            subpattern.append((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = non-capturing, 0 = no group
            # contents to parse (comment or flags)
            group = 1
            name = None
            if source.match("?"):
                group = 0
                # options
                if source.match("P"):
                    # python extensions
                    if source.match("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
                            raise error("illegal character in group name")
                    elif source.match("="):
                        # named backreference
                        raise error("not yet implemented")
                    else:
                        char = source.get()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif source.match(":"):
                    # non-capturing group
                    group = 2
                elif source.match("#"):
                    # comment: skip up to, but not including, the ")"
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        source.get()
                else:
                    # flags
                    while FLAGS.has_key(source.next):
                        state.flags = state.flags | FLAGS[source.get()]
            if group:
                # parse group contents
                b = []
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.getgroup(name)
                while 1:
                    p = _parse(source, state, flags)
                    if source.match(")"):
                        if b:
                            b.append(p)
                            p = _branch(state, b)
                        subpattern.append((SUBPATTERN, (group, p)))
                        break
                    elif source.match("|"):
                        b.append(p)
                    else:
                        raise error("group not properly closed")
            else:
                # nothing but the closing ")" may follow
                while 1:
                    char = source.get()
                    if char is None or char == ")":
                        break
                    raise error("unknown extension")

        elif this == "^":
            subpattern.append((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpattern.append(code)

        else:
            raise error("parser error")

    return subpattern
505
def parse(pattern, flags=0):
    # parse 're' pattern into list of (opcode, argument) tuples
    #
    # pattern -- the pattern string
    # flags -- initial flag bits; flags given inside the pattern are
    #     added to them
    #
    # returns the SubPattern for the whole pattern, raises error if
    # the pattern is malformed
    source = Tokenizer(pattern)
    state = State()
    # the parser consults state.flags (e.g. for verbose mode), so the
    # caller's flags have to be recorded there to take effect
    state.flags = flags
    b = []
    while 1:
        p = _parse(source, state, flags)
        tail = source.get()
        if tail == "|":
            # top-level alternative; collect it and parse the next one
            b.append(p)
        elif tail == ")":
            raise error("unbalanced parenthesis")
        elif tail is None:
            if b:
                b.append(p)
                p = _branch(state, b)
            break
        else:
            raise error("bogus characters at end of regular expression")
    return p
526
def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references
    #
    # source -- the replacement string
    # pattern -- pattern object; its "groups" and "groupindex"
    #     attributes are used to resolve group references
    #
    # returns a list of (LITERAL, text) and (MARK, group) tuples.
    # raises error for malformed references and IndexError for an
    # unknown group name
    s = Tokenizer(source)
    p = []
    a = p.append
    while 1:
        this = s.get()
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
            # group
            if this == "\\g":
                # \g<name> or \g<number>
                name = ""
                if s.match("<"):
                    while 1:
                        char = s.get()
                        if char is None:
                            raise error("unterminated group name")
                        if char == ">":
                            break
                        name = name + char
                if not name:
                    raise error("bad group name")
                try:
                    index = int(name)
                except ValueError:
                    if not isname(name):
                        raise error("illegal character in group name")
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name")
                a((MARK, index))
            elif len(this) > 1 and this[1] in DIGITS:
                # numeric group reference or octal escape
                code = None
                while 1:
                    group = _group(this, pattern.groups+1)
                    if group:
                        # stop here unless the next token is a digit
                        # that still gives a valid group number
                        if (s.next not in DIGITS or
                            not _group(this + s.next, pattern.groups+1)):
                            code = MARK, int(group)
                            break
                        # the longer number is valid too: take the
                        # digit and look again (without this the loop
                        # would never advance)
                        this = this + s.get()
                    elif s.next in OCTDIGITS:
                        this = this + s.get()
                    else:
                        break
                if not code:
                    this = this[1:]
                    # FIXME: support unicode characters!
                    code = LITERAL, chr(int(this[-6:], 8) & 0xff)
                a(code)
            else:
                try:
                    a(ESCAPES[this])
                except KeyError:
                    # unknown escape: keep backslash and character
                    for c in this:
                        a((LITERAL, c))
        else:
            a((LITERAL, this))
    return p
588
def expand_template(template, match):
    # build the replacement text for one match from a parsed template
    # (a sequence of (code, value) pairs).  LITERAL values are copied
    # as they are, MARK values are looked up with match.group();
    # raises error if such a group is None
    # FIXME: <fl> this is sooooo slow.  drop in the slicelist
    # code instead
    parts = []
    for code, value in template:
        if code is LITERAL:
            parts.append(value)
        elif code is MARK:
            text = match.group(value)
            if text is None:
                raise error("empty group")
            parts.append(text)
    # join with an empty slice of the subject string
    empty = match.string[:0]
    return empty.join(parts)
602 return match.string[:0].join(p)