blob: c5b71b8bc0633e21730eef06b779ac66db1773c3 [file] [log] [blame]
Guido van Rossumbf9d3531997-10-06 14:45:17 +00001import sys
Guido van Rossum5ca1b711997-07-10 21:00:31 +00002import string
Guido van Rossumbf9d3531997-10-06 14:45:17 +00003from pcre import *
Guido van Rossum5ca1b711997-07-10 21:00:31 +00004
#
# First, the public part of the interface:
#
8
# pcre.error and re.error should be the same object, since exceptions
# can be raised from either module.
Guido van Rossum5ca1b711997-07-10 21:00:31 +000011
# Compilation flags
13
# One-letter aliases for the long flag names (the long names are not
# defined in this file; presumably they arrive via `from pcre import *`).
I, L, M, S, X = IGNORECASE, LOCALE, MULTILINE, DOTALL, VERBOSE
Guido van Rossum09bcfd61997-07-15 15:38:20 +000019
#
# Cache of compiled patterns used by the module-level functions
#
23
Guido van Rossum26d80e61997-07-15 18:59:04 +000024_cache = {}
25_MAXCACHE = 20
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000026
def _cachecompile(pattern, flags=0):
    """Return the compiled form of pattern, reusing a cached copy if any.

    Results are remembered in _cache under the key (pattern, flags).
    When the cache already holds _MAXCACHE entries it is wiped completely
    before the new entry is stored.
    """
    cache_key = (pattern, flags)
    cached = _cache.get(cache_key)
    if cached is not None:
        return cached
    compiled = compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        # No per-entry eviction: simply start over with an empty cache.
        _cache.clear()
    _cache[cache_key] = compiled
    return compiled
38
def match(pattern, string, flags=0):
    """Try to apply pattern at the start of string.

    pattern may be a pattern string, which is compiled (and cached)
    with the given flags, or an already compiled pattern object, i.e.
    any object providing a match() method, which is used directly.
    This mirrors sub(), subn(), split() and findall(), which also accept
    compiled patterns.

    Returns whatever the pattern object's match() method returns.

    Raises ValueError if flags is non-zero together with a compiled
    pattern, because the flags could not be applied.
    """
    if hasattr(pattern, 'match'):
        # Already compiled: flags were fixed at compile time.
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")
        return pattern.match(string)
    return _cachecompile(pattern, flags).match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000041
def search(pattern, string, flags=0):
    """Scan through string looking for a match to pattern.

    pattern may be a pattern string, which is compiled (and cached)
    with the given flags, or an already compiled pattern object, i.e.
    any object providing a search() method, which is used directly.
    This mirrors sub(), subn(), split() and findall(), which also accept
    compiled patterns.

    Returns whatever the pattern object's search() method returns.

    Raises ValueError if flags is non-zero together with a compiled
    pattern, because the flags could not be applied.
    """
    if hasattr(pattern, 'search'):
        # Already compiled: flags were fixed at compile time.
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")
        return pattern.search(string)
    return _cachecompile(pattern, flags).search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000044
def sub(pattern, repl, string, count=0):
    """Replace occurrences of pattern in string by repl.

    A plain string pattern is compiled through the cache first; any
    other object is assumed to be a compiled pattern and its sub()
    method is called directly.
    """
    if type(pattern) is not type(''):
        return pattern.sub(repl, string, count)
    return _cachecompile(pattern).sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000049
def subn(pattern, repl, string, count=0):
    """Like sub(), but return what the pattern object's subn() returns.

    A plain string pattern is compiled through the cache first; any
    other object is assumed to be a compiled pattern and its subn()
    method is called directly.
    """
    if type(pattern) is not type(''):
        return pattern.subn(repl, string, count)
    return _cachecompile(pattern).subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000054
def split(pattern, string, maxsplit=0):
    """Split string by the occurrences of pattern.

    A plain string pattern is compiled through the cache first; any
    other object is assumed to be a compiled pattern and its split()
    method is called directly.
    """
    if type(pattern) is not type(''):
        return pattern.split(string, maxsplit)
    return _cachecompile(pattern).split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000059
def findall(pattern, string):
    """Return the result of the pattern object's findall() on string.

    A plain string pattern is compiled through the cache first; any
    other object is assumed to be a compiled pattern and its findall()
    method is called directly.
    """
    if type(pattern) is not type(''):
        return pattern.findall(string)
    return _cachecompile(pattern).findall(string)
64
Guido van Rossumdfa67901997-12-08 17:12:06 +000065def escape(pattern):
66 "Escape all non-alphanumeric characters in pattern."
67 result = []
68 alphanum=string.letters+'_'+string.digits
69 for char in pattern:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000070 if char not in alphanum:
Guido van Rossum8430c581998-04-03 21:47:12 +000071 if char=='\000': result.append('\\000')
72 else: result.append('\\'+char)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000073 else: result.append(char)
Guido van Rossumdfa67901997-12-08 17:12:06 +000074 return string.join(result, '')
75
def compile(pattern, flags=0):
    "Compile a regular expression pattern, returning a RegexObject."
    # The same dictionary is handed to pcre_compile and then stored on the
    # RegexObject as its groupindex (presumably pcre_compile fills it with
    # the named groups -- confirm against the pcre module).
    names = {}
    return RegexObject(pattern, flags,
                       pcre_compile(pattern, flags, names),
                       names)
81
82
#
# Class definitions
#
86
87class RegexObject:
Guido van Rossumbe0b62c1998-06-29 20:29:08 +000088
Guido van Rossumbf9d3531997-10-06 14:45:17 +000089 def __init__(self, pattern, flags, code, groupindex):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000090 self.code = code
91 self.flags = flags
92 self.pattern = pattern
93 self.groupindex = groupindex
Guido van Rossumdfa67901997-12-08 17:12:06 +000094
95 def search(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000096 """Scan through string looking for a match to the pattern, returning
97 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +000098
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000099 if endpos is None or endpos>len(string):
100 endpos=len(string)
101 if endpos<pos: endpos=pos
102 regs = self.code.match(string, pos, endpos, 0)
103 if regs is None:
104 return None
105 self._num_regs=len(regs)
106
107 return MatchObject(self,
108 string,
109 pos, endpos,
110 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000111
Guido van Rossumdfa67901997-12-08 17:12:06 +0000112 def match(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000113 """Try to apply the pattern at the start of the string, returning
114 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000115
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000116 if endpos is None or endpos>len(string):
117 endpos=len(string)
118 if endpos<pos: endpos=pos
119 regs = self.code.match(string, pos, endpos, ANCHORED)
120 if regs is None:
121 return None
122 self._num_regs=len(regs)
123 return MatchObject(self,
124 string,
125 pos, endpos,
126 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000127
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000128 def sub(self, repl, string, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000129 """Return the string obtained by replacing the leftmost
130 non-overlapping occurrences of the pattern in string by the
131 replacement repl"""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000132
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000133 return self.subn(repl, string, count)[0]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000134
Guido van Rossumdfa67901997-12-08 17:12:06 +0000135 def subn(self, repl, source, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000136 """Return a 2-tuple containing (new_string, number).
137 new_string is the string obtained by replacing the leftmost
Guido van Rossum8430c581998-04-03 21:47:12 +0000138 non-overlapping occurrences of the pattern in the source
139 string by the replacement repl. number is the number of
140 substitutions that were made."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000141
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000142 if count < 0:
143 raise error, "negative substitution count"
144 if count == 0:
145 import sys
146 count = sys.maxint
147 if type(repl) == type(''):
148 if '\\' in repl:
149 repl = lambda m, r=repl: pcre_expand(m, r)
150 else:
151 repl = lambda m, r=repl: r
152 n = 0 # Number of matches
153 pos = 0 # Where to start searching
154 lastmatch = -1 # End of last match
155 results = [] # Substrings making up the result
156 end = len(source)
157 while n < count and pos <= end:
158 m = self.search(source, pos)
159 if not m:
160 break
161 i, j = m.span(0)
162 if i == j == lastmatch:
163 # Empty match adjacent to previous match
164 pos = pos + 1
165 results.append(source[lastmatch:pos])
166 continue
167 if pos < i:
168 results.append(source[pos:i])
169 results.append(repl(m))
170 pos = lastmatch = j
171 if i == j:
172 # Last match was empty; don't try here again
173 pos = pos + 1
174 results.append(source[lastmatch:pos])
175 n = n + 1
176 results.append(source[pos:])
177 return (string.join(results, ''), n)
178
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000179 def split(self, source, maxsplit=0):
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000180 """Split the source string by the occurrences of the pattern,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000181 returning a list containing the resulting substrings."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000182
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000183 if maxsplit < 0:
184 raise error, "negative split count"
185 if maxsplit == 0:
186 import sys
187 maxsplit = sys.maxint
188 n = 0
189 pos = 0
190 lastmatch = 0
191 results = []
192 end = len(source)
193 while n < maxsplit:
194 m = self.search(source, pos)
195 if not m:
196 break
197 i, j = m.span(0)
198 if i == j:
199 # Empty match
200 if pos >= end:
201 break
202 pos = pos+1
203 continue
204 results.append(source[lastmatch:i])
205 g = m.groups()
206 if g:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000207 results[len(results):] = list(g)
208 pos = lastmatch = j
209 n = n + 1
210 results.append(source[lastmatch:])
211 return results
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000212
Guido van Rossumbe0b62c1998-06-29 20:29:08 +0000213 def findall(self, string):
214 """Return a list of all non-overlapping matches in the string.
215
216 If one or more groups are present in the pattern, return a
217 list of groups; this will be a list of tuples if the pattern
218 has more than one group.
219
220 Empty matches are included in the result.
221
222 """
223 pos = 0
224 n = len(string)
225 result = []
226 while pos <= n:
227 m = self.search(string, pos)
228 if not m:
229 break
230 gr = m.groups()
231 if not gr:
232 gr = m.group()
233 elif len(gr) == 1:
234 gr = gr[0]
235 result.append(gr)
236 pos = max(m.end(), pos+1)
237 return result
238
Guido van Rossumdfa67901997-12-08 17:12:06 +0000239 # The following 3 functions were contributed by Mike Fletcher, and
240 # allow pickling and unpickling of RegexObject instances.
241 def __getinitargs__(self):
242 return (None,None,None,None) # any 4 elements, to work around
243 # problems with the
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000244 # pickle/cPickle modules not yet
245 # ignoring the __init__ function
Guido van Rossumdfa67901997-12-08 17:12:06 +0000246 def __getstate__(self):
247 return self.pattern, self.flags, self.groupindex
248 def __setstate__(self, statetuple):
249 self.pattern = statetuple[0]
250 self.flags = statetuple[1]
251 self.groupindex = statetuple[2]
252 self.code = apply(pcre_compile, statetuple)
253
class MatchObject:
    """The result of a successful match.

    Attributes:
      re      -- the pattern object that produced the match
      string  -- the string that was searched
      pos     -- start of the searched region
      endpos  -- end of the searched region
      regs    -- sequence of (start, end) pairs, one per group; entry 0
                 is the whole match and (-1, -1) marks a group that did
                 not take part in the match
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _index(self, g):
        """Translate a group name or number into an index into self.regs.

        Names are looked up in self.re.groupindex.  Raises IndexError
        for unknown names and for numbers outside 0..len(self.regs)-1;
        negative numbers are rejected instead of wrapping around.
        """
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group %s is undefined' % repr(g))
        if g < 0 or g >= len(self.regs):
            raise IndexError('group %s is undefined' % repr(g))
        return g

    def start(self, g = 0):
        "Return the start of the substring matched by group g"
        return self.regs[self._index(g)][0]

    def end(self, g = 0):
        "Return the end of the substring matched by group g"
        return self.regs[self._index(g)][1]

    def span(self, g = 0):
        "Return (start, end) of the substring matched by group g"
        return self.regs[self._index(g)]

    def groups(self, default=None):
        """Return a tuple containing all subgroups of the match object.

        Groups that did not take part in the match are reported as
        default.
        """
        result = []
        # Entry 0 is the whole match, so subgroups start at 1.  The count
        # comes from this match's own regs rather than from state kept on
        # the pattern object.
        for a, b in self.regs[1:]:
            if a == -1 or b == -1:
                result.append(default)
            else:
                result.append(self.string[a:b])
        return tuple(result)

    def group(self, *groups):
        """Return one or more groups of the match.

        With no arguments the whole match is returned.  With one
        argument the result is a single string (or None for a group
        that did not participate); with several it is a tuple.  Raises
        IndexError for an undefined group name or number.
        """
        if len(groups) == 0:
            groups = (0,)
        result = []
        for g in groups:
            a, b = self.regs[self._index(g)]
            if a == -1 or b == -1:
                result.append(None)
            else:
                result.append(self.string[a:b])
        if len(result) > 1:
            return tuple(result)
        return result[0]

    def groupdict(self, default=None):
        """Return a dictionary containing all named subgroups of the match.

        Groups that did not take part in the match map to default.
        """
        named = {}
        for name, index in self.re.groupindex.items():
            a, b = self.regs[index]
            if a == -1 or b == -1:
                named[name] = default
            else:
                named[name] = self.string[a:b]
        return named