blob: f6bac0871da3b6c931c602635523ede1ebc1ab55 [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
Guido van Rossumbf9d3531997-10-06 14:45:17 +00003
4import sys
Guido van Rossum5ca1b711997-07-10 21:00:31 +00005import string
Guido van Rossumbf9d3531997-10-06 14:45:17 +00006from pcre import *
Guido van Rossum5ca1b711997-07-10 21:00:31 +00007
Guido van Rossumbf9d3531997-10-06 14:45:17 +00008#
9# First, the public part of the interface:
10#
11
12# pcre.error and re.error should be the same, since exceptions can be
Guido van Rossumdfa67901997-12-08 17:12:06 +000013# raised from either module.
Guido van Rossum5ca1b711997-07-10 21:00:31 +000014
15# compilation flags
16
# Single-letter aliases for the compilation flags.  The long names are not
# defined in this file; presumably they arrive via `from pcre import *`.
I = IGNORECASE
L = LOCALE
M = MULTILINE
S = DOTALL
X = VERBOSE
Guido van Rossum09bcfd61997-07-15 15:38:20 +000022
23#
24#
25#
26
# Compiled patterns remembered by _cachecompile(), keyed by (pattern, flags).
_cache = {}
# Once the cache holds this many entries, _cachecompile() empties it
# completely before storing the next compiled pattern.
_MAXCACHE = 20
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000029
def _cachecompile(pattern, flags=0):
    """Return the compiled form of pattern, reusing a cached copy if any.

    The cache is keyed by the (pattern, flags) pair.  A miss compiles the
    pattern with compile() and stores the result; when the cache already
    holds _MAXCACHE entries it is emptied first.
    """
    cache_key = (pattern, flags)
    try:
        compiled = _cache[cache_key]
    except KeyError:
        compiled = compile(pattern, flags)
        # Crude eviction policy: throw everything away once the cache is full.
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        _cache[cache_key] = compiled
    return compiled
41
def match(pattern, string, flags=0):
    """Compile pattern (through the cache) and apply its match() to string.

    Returns whatever the compiled object's match() method returns.
    """
    compiled = _cachecompile(pattern, flags)
    return compiled.match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000044
def search(pattern, string, flags=0):
    """Compile pattern (through the cache) and apply its search() to string.

    Returns whatever the compiled object's search() method returns.
    """
    compiled = _cachecompile(pattern, flags)
    return compiled.search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000047
def sub(pattern, repl, string, count=0):
    """Replace occurrences of pattern in string by repl.

    pattern may be a pattern string, which is compiled through the cache
    with default flags, or any other object, which is used as-is and must
    provide a sub(repl, string, count) method.
    """
    if type(pattern) != type(''):
        # Not a plain string: treat it as an already compiled pattern.
        return pattern.sub(repl, string, count)
    return _cachecompile(pattern).sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000052
def subn(pattern, repl, string, count=0):
    """Like sub(), but return what the compiled object's subn() returns.

    pattern may be a pattern string, which is compiled through the cache
    with default flags, or any other object, which is used as-is and must
    provide a subn(repl, string, count) method.
    """
    if type(pattern) != type(''):
        # Not a plain string: treat it as an already compiled pattern.
        return pattern.subn(repl, string, count)
    return _cachecompile(pattern).subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000057
def split(pattern, string, maxsplit=0):
    """Split string by the occurrences of pattern.

    pattern may be a pattern string, which is compiled through the cache
    with default flags, or any other object, which is used as-is and must
    provide a split(string, maxsplit) method.
    """
    if type(pattern) != type(''):
        # Not a plain string: treat it as an already compiled pattern.
        return pattern.split(string, maxsplit)
    return _cachecompile(pattern).split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000062
def escape(pattern):
    """Return pattern with every non-alphanumeric character escaped.

    ASCII letters, ASCII digits and the underscore are copied unchanged.
    A NUL character is written as a backslash followed by 000; every other
    character is preceded by a single backslash.
    """
    # A fixed ASCII set is used instead of string.letters: that value
    # follows the current locale, which made the result of escape() depend
    # on whatever setlocale() call happened to run before it.
    alphanum = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '_0123456789')
    result = []
    for char in pattern:
        if char in alphanum:
            result.append(char)
        elif char == '\000':
            # A backslash followed by a literal NUL would embed a NUL in
            # the pattern text; spell it as an octal escape instead.
            result.append('\\000')
        else:
            result.append('\\' + char)
    return ''.join(result)
73
def compile(pattern, flags=0):
    """Compile a regular expression pattern into a RegexObject.

    A fresh dictionary is handed to pcre_compile() together with the
    pattern and flags (presumably to be filled with the named groups), and
    is then stored on the returned object as its groupindex.
    """
    name_to_group = {}
    compiled_code = pcre_compile(pattern, flags, name_to_group)
    return RegexObject(pattern, flags, compiled_code, name_to_group)
79
80
Guido van Rossum5ca1b711997-07-10 21:00:31 +000081#
Guido van Rossumdfa67901997-12-08 17:12:06 +000082# Class definitions
Guido van Rossum5ca1b711997-07-10 21:00:31 +000083#
84
85class RegexObject:
Guido van Rossumbf9d3531997-10-06 14:45:17 +000086 def __init__(self, pattern, flags, code, groupindex):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000087 self.code = code
88 self.flags = flags
89 self.pattern = pattern
90 self.groupindex = groupindex
Guido van Rossumdfa67901997-12-08 17:12:06 +000091
92 def search(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000093 """Scan through string looking for a match to the pattern, returning
94 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +000095
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000096 if endpos is None or endpos>len(string):
97 endpos=len(string)
98 if endpos<pos: endpos=pos
99 regs = self.code.match(string, pos, endpos, 0)
100 if regs is None:
101 return None
102 self._num_regs=len(regs)
103
104 return MatchObject(self,
105 string,
106 pos, endpos,
107 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000108
Guido van Rossumdfa67901997-12-08 17:12:06 +0000109 def match(self, string, pos=0, endpos=None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000110 """Try to apply the pattern at the start of the string, returning
111 a MatchObject instance, or None if no match was found."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000112
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000113 if endpos is None or endpos>len(string):
114 endpos=len(string)
115 if endpos<pos: endpos=pos
116 regs = self.code.match(string, pos, endpos, ANCHORED)
117 if regs is None:
118 return None
119 self._num_regs=len(regs)
120 return MatchObject(self,
121 string,
122 pos, endpos,
123 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000124
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000125 def sub(self, repl, string, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000126 """Return the string obtained by replacing the leftmost
127 non-overlapping occurrences of the pattern in string by the
128 replacement repl"""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000129
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000130 return self.subn(repl, string, count)[0]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000131
Guido van Rossumdfa67901997-12-08 17:12:06 +0000132 def subn(self, repl, source, count=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000133 """Return a 2-tuple containing (new_string, number).
134 new_string is the string obtained by replacing the leftmost
Guido van Rossum8430c581998-04-03 21:47:12 +0000135 non-overlapping occurrences of the pattern in the source
136 string by the replacement repl. number is the number of
137 substitutions that were made."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000138
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000139 if count < 0:
140 raise error, "negative substitution count"
141 if count == 0:
142 import sys
143 count = sys.maxint
144 if type(repl) == type(''):
145 if '\\' in repl:
146 repl = lambda m, r=repl: pcre_expand(m, r)
147 else:
148 repl = lambda m, r=repl: r
149 n = 0 # Number of matches
150 pos = 0 # Where to start searching
151 lastmatch = -1 # End of last match
152 results = [] # Substrings making up the result
153 end = len(source)
154 while n < count and pos <= end:
155 m = self.search(source, pos)
156 if not m:
157 break
158 i, j = m.span(0)
159 if i == j == lastmatch:
160 # Empty match adjacent to previous match
161 pos = pos + 1
162 results.append(source[lastmatch:pos])
163 continue
164 if pos < i:
165 results.append(source[pos:i])
166 results.append(repl(m))
167 pos = lastmatch = j
168 if i == j:
169 # Last match was empty; don't try here again
170 pos = pos + 1
171 results.append(source[lastmatch:pos])
172 n = n + 1
173 results.append(source[pos:])
174 return (string.join(results, ''), n)
175
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000176 def split(self, source, maxsplit=0):
Guido van Rossum8430c581998-04-03 21:47:12 +0000177 """Split the \var{source} string by the occurrences of the pattern,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000178 returning a list containing the resulting substrings."""
Guido van Rossumdfa67901997-12-08 17:12:06 +0000179
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000180 if maxsplit < 0:
181 raise error, "negative split count"
182 if maxsplit == 0:
183 import sys
184 maxsplit = sys.maxint
185 n = 0
186 pos = 0
187 lastmatch = 0
188 results = []
189 end = len(source)
190 while n < maxsplit:
191 m = self.search(source, pos)
192 if not m:
193 break
194 i, j = m.span(0)
195 if i == j:
196 # Empty match
197 if pos >= end:
198 break
199 pos = pos+1
200 continue
201 results.append(source[lastmatch:i])
202 g = m.groups()
203 if g:
204 if type(g)==type( "" ): g = [g]
205 results[len(results):] = list(g)
206 pos = lastmatch = j
207 n = n + 1
208 results.append(source[lastmatch:])
209 return results
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000210
Guido van Rossumdfa67901997-12-08 17:12:06 +0000211 # The following 3 functions were contributed by Mike Fletcher, and
212 # allow pickling and unpickling of RegexObject instances.
213 def __getinitargs__(self):
214 return (None,None,None,None) # any 4 elements, to work around
215 # problems with the
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000216 # pickle/cPickle modules not yet
217 # ignoring the __init__ function
Guido van Rossumdfa67901997-12-08 17:12:06 +0000218 def __getstate__(self):
219 return self.pattern, self.flags, self.groupindex
220 def __setstate__(self, statetuple):
221 self.pattern = statetuple[0]
222 self.flags = statetuple[1]
223 self.groupindex = statetuple[2]
224 self.code = apply(pcre_compile, statetuple)
225
class MatchObject:
    """The result of a successful match.

    Attributes:
      re     -- the pattern object that produced the match
      string -- the string that was searched
      pos    -- the start position handed to the matcher
      endpos -- the end position handed to the matcher
      regs   -- sequence of (start, end) pairs, one per group, with
                group 0 being the whole match; a group that did not take
                part in the match has -1 in its pair
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _index(self, g):
        # Translate a group name or number into a valid index into
        # self.regs, raising IndexError for anything that is not a group.
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group "' + g + '" is undefined')
        # Negative numbers are rejected explicitly; plain sequence
        # indexing would silently count them from the end.
        if g < 0 or g >= len(self.regs):
            raise IndexError('group "' + str(g) + '" is undefined')
        return g

    def _text(self, index):
        # Return the substring for an already validated group index, or
        # None when that group did not take part in the match.
        first, last = self.regs[index]
        if first == -1 or last == -1:
            return None
        return self.string[first:last]

    def start(self, g=0):
        "Return the start of the substring matched by group g"
        return self.regs[self._index(g)][0]

    def end(self, g=0):
        "Return the end of the substring matched by group g"
        return self.regs[self._index(g)][1]

    def span(self, g=0):
        """Return a tuple containing the start,end of the substring
        matched by group g"""
        return self.regs[self._index(g)]

    def groups(self):
        """Return a tuple containing all subgroups of the match object.

        Groups that did not take part in the match are reported as None."""
        result = []
        for g in range(1, self.re._num_regs):
            result.append(self._text(g))
        return tuple(result)

    def group(self, *groups):
        """Return one or more groups of the match.

        Without arguments the whole match (group 0) is returned.  With one
        argument the result is that group's text, or None if the group
        did not take part in the match; with several arguments it is a
        tuple with one item per argument.  Groups may be given by number
        or by name; an unknown name, a negative number or a number that is
        too large raises IndexError."""
        if len(groups) == 0:
            groups = (0,)
        result = []
        for g in groups:
            result.append(self._text(self._index(g)))
        if len(result) == 1:
            return result[0]
        return tuple(result)
293 return ()
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000294