blob: dce29ceac9921dd2592658b3220d7b1135e6b069 [file] [log] [blame]
Guido van Rossumbf9d3531997-10-06 14:45:17 +00001import sys
Guido van Rossum5ca1b711997-07-10 21:00:31 +00002import string
Guido van Rossumbf9d3531997-10-06 14:45:17 +00003from pcre import *
Guido van Rossum5ca1b711997-07-10 21:00:31 +00004
#
# First, the public part of the interface:
#
8
# pcre.error and re.error should be the same, since exceptions can be
# raised from either module.
Guido van Rossum5ca1b711997-07-10 21:00:31 +000011
# compilation flags
#
# Single-letter aliases for the flag constants brought into this
# namespace by "from pcre import *".

I = IGNORECASE
L = LOCALE
M = MULTILINE
S = DOTALL
X = VERBOSE
Guido van Rossum09bcfd61997-07-15 15:38:20 +000019
20#
21#
22#
23
# Cache of compiled patterns, keyed by (pattern, flags).  It is emptied
# completely once it already holds _MAXCACHE entries.
_cache = {}
_MAXCACHE = 20

def _cachecompile(pattern, flags=0):
    """Return the compiled object for (pattern, flags), caching the result.

    On a cache miss the pattern is compiled with this module's compile().
    If the cache is already full at that point it is cleared before the
    new entry is stored.
    """
    cache_key = (pattern, flags)
    cached = _cache.get(cache_key)
    if cached is not None:
        return cached
    compiled = compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[cache_key] = compiled
    return compiled
38
def match(pattern, string, flags=0):
    """Compile pattern (through the module cache) and call its match().

    Returns whatever the compiled object's match(string) returns.
    """
    compiled = _cachecompile(pattern, flags)
    return compiled.match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000041
def search(pattern, string, flags=0):
    """Compile pattern (through the module cache) and call its search().

    Returns whatever the compiled object's search(string) returns.
    """
    compiled = _cachecompile(pattern, flags)
    return compiled.search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000044
def sub(pattern, repl, string, count=0):
    """Call sub(repl, string, count) on pattern and return its result.

    A pattern given as a string is first compiled through the module
    cache (with the default flags); anything else is used as it is.
    """
    if type(pattern) is type(''):
        regex = _cachecompile(pattern)
    else:
        regex = pattern
    return regex.sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000049
def subn(pattern, repl, string, count=0):
    """Call subn(repl, string, count) on pattern and return its result.

    A pattern given as a string is first compiled through the module
    cache (with the default flags); anything else is used as it is.
    """
    if type(pattern) is type(''):
        regex = _cachecompile(pattern)
    else:
        regex = pattern
    return regex.subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000054
def split(pattern, string, maxsplit=0):
    """Call split(string, maxsplit) on pattern and return its result.

    A pattern given as a string is first compiled through the module
    cache (with the default flags); anything else is used as it is.
    """
    if type(pattern) is type(''):
        regex = _cachecompile(pattern)
    else:
        regex = pattern
    return regex.split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000059
def findall(pattern, string):
    """Call findall(string) on pattern and return its result.

    A pattern given as a string is first compiled through the module
    cache (with the default flags); anything else is used as it is.
    """
    if type(pattern) is type(''):
        regex = _cachecompile(pattern)
    else:
        regex = pattern
    return regex.findall(string)
64
def escape(pattern):
    """Return pattern with every character backslash-escaped, except
    letters, digits and the underscore.

    A NUL character is written as the four characters \\000 rather than
    as a backslash followed by a literal NUL.
    """
    safe = string.letters + '_' + string.digits
    pieces = []
    for ch in pattern:
        if ch in safe:
            pieces.append(ch)
        elif ch == '\000':
            pieces.append('\\000')
        else:
            pieces.append('\\' + ch)
    return string.join(pieces, '')
75
def compile(pattern, flags=0):
    "Compile a regular expression pattern, returning a RegexObject."
    # The same dictionary is handed to pcre_compile() and then stored on
    # the RegexObject as its group-name index (presumably filled in by
    # pcre_compile -- it is created empty here).
    names = {}
    compiled_code = pcre_compile(pattern, flags, names)
    return RegexObject(pattern, flags, compiled_code, names)
81
82
#
# Class definitions
#
86
87class RegexObject:
Guido van Rossumbe0b62c1998-06-29 20:29:08 +000088
Guido van Rossumbf9d3531997-10-06 14:45:17 +000089 def __init__(self, pattern, flags, code, groupindex):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000090 self.code = code
91 self.flags = flags
92 self.pattern = pattern
93 self.groupindex = groupindex
Guido van Rossumdfa67901997-12-08 17:12:06 +000094
95 def search(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000096 """Scan through string looking for a match to the pattern, returning
97 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +000098
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000099 if endpos is None or endpos>len(string):
100 endpos=len(string)
101 if endpos<pos: endpos=pos
102 regs = self.code.match(string, pos, endpos, 0)
103 if regs is None:
104 return None
105 self._num_regs=len(regs)
106
107 return MatchObject(self,
108 string,
109 pos, endpos,
110 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000111
Guido van Rossumdfa67901997-12-08 17:12:06 +0000112 def match(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000113 """Try to apply the pattern at the start of the string, returning
114 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000115
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000116 if endpos is None or endpos>len(string):
117 endpos=len(string)
118 if endpos<pos: endpos=pos
119 regs = self.code.match(string, pos, endpos, ANCHORED)
120 if regs is None:
121 return None
122 self._num_regs=len(regs)
123 return MatchObject(self,
124 string,
125 pos, endpos,
126 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000127
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000128 def sub(self, repl, string, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000129 """Return the string obtained by replacing the leftmost
130 non-overlapping occurrences of the pattern in string by the
131 replacement repl"""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000132
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000133 return self.subn(repl, string, count)[0]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000134
Guido van Rossumdfa67901997-12-08 17:12:06 +0000135 def subn(self, repl, source, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000136 """Return a 2-tuple containing (new_string, number).
137 new_string is the string obtained by replacing the leftmost
Guido van Rossum8430c581998-04-03 21:47:12 +0000138 non-overlapping occurrences of the pattern in the source
139 string by the replacement repl. number is the number of
140 substitutions that were made."""
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000141
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000142 if count < 0:
143 raise error, "negative substitution count"
144 if count == 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000145 count = sys.maxint
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000146 n = 0 # Number of matches
147 pos = 0 # Where to start searching
148 lastmatch = -1 # End of last match
149 results = [] # Substrings making up the result
150 end = len(source)
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000151
152 if type(repl) is type(''):
153 # See if repl contains group references
154 try:
155 repl = pcre_expand(_Dummy, repl)
156 except:
157 m = MatchObject(self, source, 0, end, [])
158 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
159 else:
160 m = None
161 else:
162 m = MatchObject(self, source, 0, end, [])
163
164 match = self.code.match
165 append = results.append
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000166 while n < count and pos <= end:
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000167 regs = match(source, pos, end, 0)
168 if not regs:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000169 break
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000170 i, j = regs[0]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000171 if i == j == lastmatch:
172 # Empty match adjacent to previous match
173 pos = pos + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000174 append(source[lastmatch:pos])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000175 continue
176 if pos < i:
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000177 append(source[pos:i])
178 if m:
179 m.pos = pos
180 m.regs = regs
181 append(repl(m))
182 else:
183 append(repl)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000184 pos = lastmatch = j
185 if i == j:
186 # Last match was empty; don't try here again
187 pos = pos + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000188 append(source[lastmatch:pos])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000189 n = n + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000190 append(source[pos:])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000191 return (string.join(results, ''), n)
192
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000193 def split(self, source, maxsplit=0):
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000194 """Split the source string by the occurrences of the pattern,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000195 returning a list containing the resulting substrings."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000196
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000197 if maxsplit < 0:
198 raise error, "negative split count"
199 if maxsplit == 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000200 maxsplit = sys.maxint
201 n = 0
202 pos = 0
203 lastmatch = 0
204 results = []
205 end = len(source)
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000206 match = self.code.match
207 append = results.append
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000208 while n < maxsplit:
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000209 regs = match(source, pos, end, 0)
210 if not regs:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000211 break
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000212 i, j = regs[0]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000213 if i == j:
214 # Empty match
215 if pos >= end:
216 break
217 pos = pos+1
218 continue
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000219 append(source[lastmatch:i])
220 rest = regs[1:]
221 if rest:
222 for a, b in rest:
223 if a == -1 or b == -1:
224 group = None
225 else:
226 group = source[a:b]
227 append(group)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000228 pos = lastmatch = j
229 n = n + 1
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000230 append(source[lastmatch:])
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000231 return results
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000232
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000233 def findall(self, source):
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000234 """Return a list of all non-overlapping matches in the string.
235
236 If one or more groups are present in the pattern, return a
237 list of groups; this will be a list of tuples if the pattern
238 has more than one group.
239
240 Empty matches are included in the result.
241
242 """
243 pos = 0
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000244 end = len(source)
245 results = []
246 match = self.code.match
247 append = results.append
248 while pos <= end:
249 regs = match(source, pos, end, 0)
250 if not regs:
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000251 break
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000252 i, j = regs[0]
253 rest = regs[1:]
254 if not rest:
255 gr = source[i:j]
256 elif len(rest) == 1:
257 a, b = rest[0]
258 gr = source[a:b]
259 else:
260 gr = []
261 for (a, b) in rest:
262 gr.append(source[a:b])
263 gr = tuple(gr)
264 append(gr)
265 pos = max(j, pos+1)
266 return results
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000267
Guido van Rossumdfa67901997-12-08 17:12:06 +0000268 # The following 3 functions were contributed by Mike Fletcher, and
269 # allow pickling and unpickling of RegexObject instances.
270 def __getinitargs__(self):
271 return (None,None,None,None) # any 4 elements, to work around
272 # problems with the
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000273 # pickle/cPickle modules not yet
274 # ignoring the __init__ function
Guido van Rossumdfa67901997-12-08 17:12:06 +0000275 def __getstate__(self):
276 return self.pattern, self.flags, self.groupindex
277 def __setstate__(self, statetuple):
278 self.pattern = statetuple[0]
279 self.flags = statetuple[1]
280 self.groupindex = statetuple[2]
281 self.code = apply(pcre_compile, statetuple)
282
Guido van Rossum0e5ab171998-07-17 20:18:49 +0000283class _Dummy:
284 # Dummy class used by _subn_string(). Has 'group' to avoid core dump.
285 group = None
286
class MatchObject:
    """The result of a successful match.

    Attributes:
      re      -- the pattern object that produced the match
      string  -- the string that was searched
      pos     -- where the search started
      endpos  -- where the search was allowed to end
      regs    -- sequence of (start, end) pairs, entry 0 for the whole
                 match and one entry per group; (-1, -1) marks a group
                 that did not take part in the match
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _resolve(self, g):
        """Turn a group name or number into a valid index into self.regs.

        Raises IndexError for an unknown name and for a number outside
        0 .. len(self.regs)-1.  Negative numbers are rejected as well, so
        that they cannot silently select a group counted from the end.
        """
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                # TypeError covers a pattern object without a usable
                # groupindex mapping.
                raise IndexError('group %s is undefined' % repr(g))
        if g < 0 or g >= len(self.regs):
            raise IndexError('group %s is undefined' % repr(g))
        return g

    def start(self, g = 0):
        "Return the start of the substring matched by group g"
        return self.regs[self._resolve(g)][0]

    def end(self, g = 0):
        "Return the end of the substring matched by group g"
        return self.regs[self._resolve(g)][1]

    def span(self, g = 0):
        "Return (start, end) of the substring matched by group g"
        return self.regs[self._resolve(g)]

    def groups(self, default=None):
        """Return a tuple containing all subgroups of the match object.

        Groups that did not take part in the match are reported as
        default.
        """
        # The group count comes from this match's own regs, not from
        # state stored on the pattern object, so it is right for every
        # MatchObject no matter how it was created.
        result = []
        for a, b in self.regs[1:]:
            if a == -1 or b == -1:
                result.append(default)
            else:
                result.append(self.string[a:b])
        return tuple(result)

    def group(self, *groups):
        """Return one or more groups of the match.

        With no argument the whole match is returned; with one argument
        a single string (or None for a group that did not take part);
        with several arguments a tuple of such values.
        """
        if len(groups) == 0:
            groups = (0,)
        result = []
        for g in groups:
            a, b = self.regs[self._resolve(g)]
            if a == -1 or b == -1:
                result.append(None)
            else:
                result.append(self.string[a:b])
        if len(result) > 1:
            return tuple(result)
        return result[0]

    def groupdict(self, default=None):
        """Return a dictionary containing all named subgroups of the match.

        Named groups that did not take part in the match map to default.
        """
        named = {}
        for name, index in self.re.groupindex.items():
            a, b = self.regs[index]
            if a == -1 or b == -1:
                named[name] = default
            else:
                named[name] = self.string[a:b]
        return named