blob: d190363df47581c0c0478b0d738749a5f1749e4b [file] [log] [blame]
Guido van Rossumbf9d3531997-10-06 14:45:17 +00001import sys
Guido van Rossum5ca1b711997-07-10 21:00:31 +00002import string
Guido van Rossumbf9d3531997-10-06 14:45:17 +00003from pcre import *
Guido van Rossum5ca1b711997-07-10 21:00:31 +00004
Guido van Rossumbf9d3531997-10-06 14:45:17 +00005#
6# First, the public part of the interface:
7#
8
9# pcre.error and re.error should be the same, since exceptions can be
Guido van Rossumdfa67901997-12-08 17:12:06 +000010# raised from either module.
Guido van Rossum5ca1b711997-07-10 21:00:31 +000011
# compilation flags

# Single-letter aliases for the flag constants.  The long names are not
# defined in this file; they arrive through "from pcre import *".
I = IGNORECASE
L = LOCALE
M = MULTILINE
S = DOTALL
X = VERBOSE
Guido van Rossum09bcfd61997-07-15 15:38:20 +000019
20#
21#
22#
23
# Compiled patterns remembered by _cachecompile(), keyed on the tuple
# (pattern, flags).
_cache = {}
# Size at which _cachecompile() empties _cache before storing a new entry.
_MAXCACHE = 20
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000026
def _cachecompile(pattern, flags=0):
    """Return the compiled form of pattern, reusing a cached object if any.

    Compiled objects are remembered in _cache under the key
    (pattern, flags).  On a miss the pattern is compiled first; only
    then, if the cache already holds _MAXCACHE entries, is the whole
    cache emptied before the new object is stored.  A pattern that fails
    to compile therefore leaves the cache untouched.
    """
    key = (pattern, flags)
    compiled = _cache.get(key)
    if compiled is None:
        compiled = compile(pattern, flags)
        # Crude eviction policy: drop everything once the limit is hit.
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        _cache[key] = compiled
    return compiled
38
def match(pattern, string, flags=0):
    """Apply pattern at the start of string.

    The pattern is compiled with flags through _cachecompile() and the
    result of the compiled object's match() method is returned
    unchanged.
    """
    regex = _cachecompile(pattern, flags)
    return regex.match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000041
def search(pattern, string, flags=0):
    """Scan string for a match to pattern.

    The pattern is compiled with flags through _cachecompile() and the
    result of the compiled object's search() method is returned
    unchanged.
    """
    regex = _cachecompile(pattern, flags)
    return regex.search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000044
def sub(pattern, repl, string, count=0):
    """Replace occurrences of pattern in string by repl.

    pattern may be a pattern string, which is compiled through
    _cachecompile() with default flags, or any object providing a
    sub(repl, string, count) method, which is used as it is.  The
    value of that method is returned.
    """
    regex = pattern
    if type(regex) == type(''):
        regex = _cachecompile(regex)
    return regex.sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000049
def subn(pattern, repl, string, count=0):
    """Like sub(), but hand back whatever the pattern's subn() returns.

    pattern may be a pattern string, which is compiled through
    _cachecompile() with default flags, or any object providing a
    subn(repl, string, count) method, which is used as it is.
    """
    regex = pattern
    if type(regex) == type(''):
        regex = _cachecompile(regex)
    return regex.subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000054
def split(pattern, string, maxsplit=0):
    """Split string by the occurrences of pattern.

    pattern may be a pattern string, which is compiled through
    _cachecompile() with default flags, or any object providing a
    split(string, maxsplit) method, which is used as it is.  The value
    of that method is returned.
    """
    regex = pattern
    if type(regex) == type(''):
        regex = _cachecompile(regex)
    return regex.split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000059
def findall(pattern, string):
    """Return what the pattern's findall() method yields for string.

    pattern may be a pattern string, which is compiled through
    _cachecompile() with default flags, or any object providing a
    findall(string) method, which is used as it is.
    """
    regex = pattern
    if type(regex) == type(''):
        regex = _cachecompile(regex)
    return regex.findall(string)
64
def escape(pattern):
    """Backslash-escape every character of pattern that is not a letter,
    a digit or an underscore.

    A NUL character is written as the four characters backslash-0-0-0
    rather than as a backslash followed by a raw NUL.
    """
    safe = string.letters + '_' + string.digits
    pieces = []
    for ch in pattern:
        if ch in safe:
            pieces.append(ch)
        elif ch == '\000':
            pieces.append('\\000')
        else:
            pieces.append('\\' + ch)
    return string.join(pieces, '')
75
def compile(pattern, flags=0):
    "Compile a regular expression pattern, returning a RegexObject."
    # A fresh dictionary is handed to pcre_compile() and then stored on
    # the RegexObject as its groupindex (presumably filled in with the
    # pattern's named groups by pcre_compile -- confirm against pcre).
    names = {}
    compiled = pcre_compile(pattern, flags, names)
    return RegexObject(pattern, flags, compiled, names)
81
82
Guido van Rossum5ca1b711997-07-10 21:00:31 +000083#
Guido van Rossumdfa67901997-12-08 17:12:06 +000084# Class definitions
Guido van Rossum5ca1b711997-07-10 21:00:31 +000085#
86
87class RegexObject:
Guido van Rossumbe0b62c1998-06-29 20:29:08 +000088
Guido van Rossumbf9d3531997-10-06 14:45:17 +000089 def __init__(self, pattern, flags, code, groupindex):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000090 self.code = code
91 self.flags = flags
92 self.pattern = pattern
93 self.groupindex = groupindex
Guido van Rossumdfa67901997-12-08 17:12:06 +000094
95 def search(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000096 """Scan through string looking for a match to the pattern, returning
97 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +000098
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000099 if endpos is None or endpos>len(string):
100 endpos=len(string)
101 if endpos<pos: endpos=pos
102 regs = self.code.match(string, pos, endpos, 0)
103 if regs is None:
104 return None
105 self._num_regs=len(regs)
106
107 return MatchObject(self,
108 string,
109 pos, endpos,
110 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000111
Guido van Rossumdfa67901997-12-08 17:12:06 +0000112 def match(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000113 """Try to apply the pattern at the start of the string, returning
114 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000115
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000116 if endpos is None or endpos>len(string):
117 endpos=len(string)
118 if endpos<pos: endpos=pos
119 regs = self.code.match(string, pos, endpos, ANCHORED)
120 if regs is None:
121 return None
122 self._num_regs=len(regs)
123 return MatchObject(self,
124 string,
125 pos, endpos,
126 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000127
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000128 def sub(self, repl, string, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000129 """Return the string obtained by replacing the leftmost
130 non-overlapping occurrences of the pattern in string by the
131 replacement repl"""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000132
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000133 return self.subn(repl, string, count)[0]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000134
Guido van Rossumdfa67901997-12-08 17:12:06 +0000135 def subn(self, repl, source, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000136 """Return a 2-tuple containing (new_string, number).
137 new_string is the string obtained by replacing the leftmost
Guido van Rossum8430c581998-04-03 21:47:12 +0000138 non-overlapping occurrences of the pattern in the source
139 string by the replacement repl. number is the number of
140 substitutions that were made."""
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000141
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000142 if count < 0:
143 raise error, "negative substitution count"
144 if count == 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000145 count = sys.maxint
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000146 n = 0 # Number of matches
147 pos = 0 # Where to start searching
148 lastmatch = -1 # End of last match
149 results = [] # Substrings making up the result
150 end = len(source)
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000151
152 if type(repl) is type(''):
153 # See if repl contains group references
154 try:
155 repl = pcre_expand(_Dummy, repl)
156 except:
157 m = MatchObject(self, source, 0, end, [])
158 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
159 else:
160 m = None
161 else:
162 m = MatchObject(self, source, 0, end, [])
163
164 match = self.code.match
165 append = results.append
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000166 while n < count and pos <= end:
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000167 regs = match(source, pos, end, 0)
168 if not regs:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000169 break
Andrew M. Kuchling9a80e001998-08-21 18:39:38 +0000170 self._num_regs = len(regs)
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000171 i, j = regs[0]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000172 if i == j == lastmatch:
173 # Empty match adjacent to previous match
174 pos = pos + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000175 append(source[lastmatch:pos])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000176 continue
177 if pos < i:
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000178 append(source[pos:i])
179 if m:
180 m.pos = pos
181 m.regs = regs
182 append(repl(m))
183 else:
184 append(repl)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000185 pos = lastmatch = j
186 if i == j:
187 # Last match was empty; don't try here again
188 pos = pos + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000189 append(source[lastmatch:pos])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000190 n = n + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000191 append(source[pos:])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000192 return (string.join(results, ''), n)
193
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000194 def split(self, source, maxsplit=0):
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000195 """Split the source string by the occurrences of the pattern,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000196 returning a list containing the resulting substrings."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000197
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000198 if maxsplit < 0:
199 raise error, "negative split count"
200 if maxsplit == 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000201 maxsplit = sys.maxint
202 n = 0
203 pos = 0
204 lastmatch = 0
205 results = []
206 end = len(source)
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000207 match = self.code.match
208 append = results.append
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000209 while n < maxsplit:
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000210 regs = match(source, pos, end, 0)
211 if not regs:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000212 break
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000213 i, j = regs[0]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000214 if i == j:
215 # Empty match
216 if pos >= end:
217 break
218 pos = pos+1
219 continue
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000220 append(source[lastmatch:i])
221 rest = regs[1:]
222 if rest:
223 for a, b in rest:
224 if a == -1 or b == -1:
225 group = None
226 else:
227 group = source[a:b]
228 append(group)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000229 pos = lastmatch = j
230 n = n + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000231 append(source[lastmatch:])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000232 return results
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000233
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000234 def findall(self, source):
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000235 """Return a list of all non-overlapping matches in the string.
236
237 If one or more groups are present in the pattern, return a
238 list of groups; this will be a list of tuples if the pattern
239 has more than one group.
240
241 Empty matches are included in the result.
242
243 """
244 pos = 0
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000245 end = len(source)
246 results = []
247 match = self.code.match
248 append = results.append
249 while pos <= end:
250 regs = match(source, pos, end, 0)
251 if not regs:
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000252 break
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000253 i, j = regs[0]
254 rest = regs[1:]
255 if not rest:
256 gr = source[i:j]
257 elif len(rest) == 1:
258 a, b = rest[0]
259 gr = source[a:b]
260 else:
261 gr = []
262 for (a, b) in rest:
263 gr.append(source[a:b])
264 gr = tuple(gr)
265 append(gr)
266 pos = max(j, pos+1)
267 return results
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000268
Guido van Rossumdfa67901997-12-08 17:12:06 +0000269 # The following 3 functions were contributed by Mike Fletcher, and
270 # allow pickling and unpickling of RegexObject instances.
271 def __getinitargs__(self):
272 return (None,None,None,None) # any 4 elements, to work around
273 # problems with the
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000274 # pickle/cPickle modules not yet
275 # ignoring the __init__ function
Guido van Rossumdfa67901997-12-08 17:12:06 +0000276 def __getstate__(self):
277 return self.pattern, self.flags, self.groupindex
278 def __setstate__(self, statetuple):
279 self.pattern = statetuple[0]
280 self.flags = statetuple[1]
281 self.groupindex = statetuple[2]
282 self.code = apply(pcre_compile, statetuple)
283
class _Dummy:
    # Stand-in match object that RegexObject.subn() passes to
    # pcre_expand() when probing a replacement template for group
    # references.  Has 'group' to avoid core dump.
    group = None
287
class MatchObject:
    """The result of a successful match.

    Attributes:
        re     -- the regular expression object that produced the match;
                  only its 'groupindex' dictionary (group name -> group
                  number) is consulted here
        string -- the string that was searched
        pos    -- position in string at which the search started
        endpos -- position in string at which the search had to stop
        regs   -- sequence of (start, end) pairs; entry 0 is the whole
                  match, entry N is group N, and -1 marks a group that
                  did not take part in the match
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _span(self, g):
        # Turn a group number or group name into its (start, end) pair.
        # Unknown names and numbers outside 0..len(regs)-1 raise
        # IndexError; negative numbers are rejected explicitly because
        # plain indexing would silently count from the end.
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group %s is undefined' % repr(g))
        if g < 0 or g >= len(self.regs):
            raise IndexError('group %s is undefined' % repr(g))
        return self.regs[g]

    def start(self, g = 0):
        "Return the start of the substring matched by group g"
        return self._span(g)[0]

    def end(self, g = 0):
        "Return the end of the substring matched by group g"
        return self._span(g)[1]

    def span(self, g = 0):
        "Return (start, end) of the substring matched by group g"
        return self._span(g)

    def groups(self, default=None):
        """Return a tuple containing all subgroups of the match object.

        Groups that did not take part in the match are reported as
        'default'."""
        result = []
        # Walk this match's own spans (skipping entry 0, the whole
        # match) instead of relying on state kept on the regex object.
        for a, b in self.regs[1:]:
            if a == -1 or b == -1:
                result.append(default)
            else:
                result.append(self.string[a:b])
        return tuple(result)

    def group(self, *groups):
        """Return one or more groups of the match.

        Each argument is a group number or a group name.  With no
        argument the whole match is returned; with one argument a
        single string (or None for a group that did not take part);
        with several arguments a tuple with one item per argument.
        An undefined group raises IndexError."""
        if len(groups) == 0:
            groups = (0,)
        result = []
        for g in groups:
            a, b = self._span(g)
            if a == -1 or b == -1:
                result.append(None)
            else:
                result.append(self.string[a:b])
        if len(result) == 1:
            return result[0]
        return tuple(result)

    def groupdict(self, default=None):
        """Return a dictionary containing all named subgroups of the match.

        Named groups that did not take part in the match map to
        'default'."""
        named = {}
        for name, index in self.re.groupindex.items():
            a, b = self.regs[index]
            if a == -1 or b == -1:
                named[name] = default
            else:
                named[name] = self.string[a:b]
        return named
369 return dict