# Regular expression interface implemented on top of the pcre module.
import sys
import string
from pcre import *

#
# First, the public part of the interface:
#

# pcre.error and re.error should be the same, since exceptions can be
# raised from either module.

# Compilation flags: single-letter aliases for the option constants.
# The long names are not defined in this file; they are expected to be
# supplied by the star import of pcre.

I = IGNORECASE
L = LOCALE
M = MULTILINE
S = DOTALL
X = VERBOSE

#
# Pattern cache and module-level functions
#

_cache = {}
_MAXCACHE = 20


def _cachecompile(pattern, flags=0):
    """Return the compiled form of pattern, compiling only on a cache miss.

    Compiled objects are remembered in the module-level _cache dictionary
    under the key (pattern, flags).  When the cache already holds
    _MAXCACHE entries, it is emptied completely before the new entry is
    stored.
    """
    key = (pattern, flags)
    try:
        return _cache[key]
    except KeyError:
        pass
    # This is the compile() defined in this module, not the builtin.
    value = compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[key] = value
    return value

def match(pattern, string, flags=0):
    """Apply pattern at the start of string.

    The pattern is compiled (or fetched from the cache) with the given
    flags by _cachecompile(), and the result of the compiled object's
    match() method is returned.
    """
    return _cachecompile(pattern, flags).match(string)

def search(pattern, string, flags=0):
    """Scan through string looking for a match to pattern.

    The pattern is compiled (or fetched from the cache) with the given
    flags by _cachecompile(), and the result of the compiled object's
    search() method is returned.
    """
    return _cachecompile(pattern, flags).search(string)

def sub(pattern, repl, string, count=0):
    """Return string with occurrences of pattern replaced by repl.

    pattern is either a pattern string, which is compiled through
    _cachecompile(), or an already compiled object, whose sub() method is
    called directly.  repl, string and count are passed through unchanged.
    """
    if isinstance(pattern, str):
        pattern = _cachecompile(pattern)
    return pattern.sub(repl, string, count)

def subn(pattern, repl, string, count=0):
    """Like sub(), but return whatever the compiled object's subn() returns.

    pattern is either a pattern string, which is compiled through
    _cachecompile(), or an already compiled object, whose subn() method
    is called directly.  repl, string and count are passed through
    unchanged.
    """
    if isinstance(pattern, str):
        pattern = _cachecompile(pattern)
    return pattern.subn(repl, string, count)

def split(pattern, string, maxsplit=0):
    """Split string by the occurrences of pattern.

    pattern is either a pattern string, which is compiled through
    _cachecompile(), or an already compiled object, whose split() method
    is called directly.  string and maxsplit are passed through unchanged.
    """
    if isinstance(pattern, str):
        pattern = _cachecompile(pattern)
    return pattern.split(string, maxsplit)

def escape(pattern):
    """Escape all non-alphanumeric characters in pattern.

    Letters, digits and the underscore are copied unchanged.  A NUL
    character is written as the four characters \\000; every other
    character is preceded by a backslash.  Returns the escaped string.
    """
    result = []
    for char in pattern:
        if char.isalnum() or char == '_':
            result.append(char)
        elif char == '\000':
            # A literal NUL is spelled as a three-digit octal escape.
            result.append('\\000')
        else:
            result.append('\\' + char)
    return ''.join(result)

def compile(pattern, flags=0):
    """Compile a regular expression pattern, returning a RegexObject.

    A fresh dictionary is handed to pcre_compile() as its third argument
    and then stored on the result as groupindex, where it is used to
    translate group names into group numbers (presumably pcre_compile()
    fills it in; that code is not in this file).

    Note that this function shadows the builtin compile() inside this
    module.
    """
    groupindex = {}
    code = pcre_compile(pattern, flags, groupindex)
    return RegexObject(pattern, flags, code, groupindex)


#
# Class definitions
#

class RegexObject:
    """A compiled regular expression.

    Attributes:
        code        -- compiled object; its match(string, pos, endpos,
                       options) method does the actual matching
        flags       -- the flags the pattern was compiled with
        pattern     -- the pattern that was compiled
        groupindex  -- dictionary used to map group names to group numbers
    """

    def __init__(self, pattern, flags, code, groupindex):
        self.code = code
        self.flags = flags
        self.pattern = pattern
        self.groupindex = groupindex

    def _execute(self, string, pos, endpos, options):
        """Clamp the bounds, run the compiled code and wrap the result.

        Shared implementation of search() and match(); returns a
        MatchObject, or None if the compiled code reports no match.
        """
        if endpos is None or endpos > len(string):
            endpos = len(string)
        if endpos < pos:
            endpos = pos
        regs = self.code.match(string, pos, endpos, options)
        if regs is None:
            return None
        # Kept on the pattern object because MatchObject.groups() may
        # read the group count from here.
        self._num_regs = len(regs)
        return MatchObject(self, string, pos, endpos, regs)

    def search(self, string, pos=0, endpos=None):
        """Scan through string looking for a match to the pattern, returning
        a MatchObject instance, or None if no match was found."""
        return self._execute(string, pos, endpos, 0)

    def match(self, string, pos=0, endpos=None):
        """Try to apply the pattern at the start of the string, returning
        a MatchObject instance, or None if no match was found."""
        return self._execute(string, pos, endpos, ANCHORED)

    def sub(self, repl, string, count=0):
        """Return the string obtained by replacing the leftmost
        non-overlapping occurrences of the pattern in string by the
        replacement repl."""
        return self.subn(repl, string, count)[0]

    def subn(self, repl, source, count=0):
        """Return a 2-tuple containing (new_string, number).

        new_string is the string obtained by replacing the leftmost
        non-overlapping occurrences of the pattern in the source
        string by the replacement repl.  number is the number of
        substitutions that were made.

        repl is either a string or a callable taking the match object.
        A string containing a backslash is expanded with pcre_expand()
        for every match; any other string is inserted literally.
        A count of 0 means "replace every occurrence"; a negative count
        raises error.
        """
        if count < 0:
            raise error("negative substitution count")
        unlimited = (count == 0)
        if isinstance(repl, str):
            if '\\' in repl:
                repl = lambda m, r=repl: pcre_expand(m, r)
            else:
                repl = lambda m, r=repl: r
        n = 0               # Number of matches
        pos = 0             # Where to start searching
        lastmatch = -1      # End of last match
        results = []        # Substrings making up the result
        end = len(source)
        while (unlimited or n < count) and pos <= end:
            m = self.search(source, pos)
            if m is None:
                break
            i, j = m.span(0)
            if i == j == lastmatch:
                # Empty match adjacent to previous match: skip one
                # character and copy it to the output unchanged.
                pos = pos + 1
                results.append(source[lastmatch:pos])
                continue
            if pos < i:
                results.append(source[pos:i])
            results.append(repl(m))
            pos = lastmatch = j
            if i == j:
                # Last match was empty; don't try here again
                pos = pos + 1
                results.append(source[lastmatch:pos])
            n = n + 1
        results.append(source[pos:])
        return (''.join(results), n)

    def split(self, source, maxsplit=0):
        """Split the source string by the occurrences of the pattern,
        returning a list containing the resulting substrings.

        The groups of each separator match are inserted into the list
        after the piece that precedes the separator.  Empty matches never
        split.  A maxsplit of 0 means "no limit"; a negative maxsplit
        raises error.
        """
        if maxsplit < 0:
            raise error("negative split count")
        unlimited = (maxsplit == 0)
        n = 0
        pos = 0
        lastmatch = 0
        results = []
        end = len(source)
        while unlimited or n < maxsplit:
            m = self.search(source, pos)
            if m is None:
                break
            i, j = m.span(0)
            if i == j:
                # Empty match: retry one character further on
                if pos >= end:
                    break
                pos = pos + 1
                continue
            results.append(source[lastmatch:i])
            results.extend(m.groups())
            pos = lastmatch = j
            n = n + 1
        results.append(source[lastmatch:])
        return results

    # The following 3 functions were contributed by Mike Fletcher, and
    # allow pickling and unpickling of RegexObject instances.
    def __getinitargs__(self):
        # Any 4 elements will do; this works around problems with the
        # pickle/cPickle modules not yet ignoring the __init__ function.
        return (None, None, None, None)

    def __getstate__(self):
        # The compiled code itself is not part of the state; it is
        # rebuilt in __setstate__.
        return self.pattern, self.flags, self.groupindex

    def __setstate__(self, statetuple):
        self.pattern = statetuple[0]
        self.flags = statetuple[1]
        self.groupindex = statetuple[2]
        # Recompile from (pattern, flags, groupindex)
        self.code = pcre_compile(*statetuple)

class MatchObject:
    """The result of a successful search() or match().

    Attributes:
        re      -- the object that produced this match; its groupindex
                   dictionary is used to resolve group names
        string  -- the string that was searched
        pos     -- start bound of the search
        endpos  -- end bound of the search
        regs    -- sequence of (start, end) pairs indexed by group number,
                   group 0 being the whole match; a pair containing -1 is
                   reported as None by group() and groups()
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _group_number(self, g):
        """Translate a group name into its number; other values pass through.

        Raises IndexError if g is a string that groupindex cannot resolve.
        """
        if isinstance(g, str):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group "' + g + '" is undefined')
        return g

    def _substring(self, g):
        """Return the text of group number g, or None if it has a -1 bound."""
        start = self.regs[g][0]
        end = self.regs[g][1]
        if start == -1 or end == -1:
            return None
        return self.string[start:end]

    def start(self, g=0):
        "Return the start of the substring matched by group g"
        return self.regs[self._group_number(g)][0]

    def end(self, g=0):
        "Return the end of the substring matched by group g"
        return self.regs[self._group_number(g)][1]

    def span(self, g=0):
        """Return a tuple containing the start,end of the substring
        matched by group g"""
        return self.regs[self._group_number(g)]

    def groups(self):
        "Return a tuple containing all subgroups of the match object"
        # The group count is taken from this match's own regs rather than
        # from state stored on the pattern object.
        return tuple([self._substring(g) for g in range(1, len(self.regs))])

    def group(self, *groups):
        """Return one or more groups of the match.

        With no arguments, group 0 (the whole match) is returned.  With
        one argument the result is a single value; with several, a tuple.
        Groups may be given by number or by name.  Raises IndexError for
        an unknown name or a number beyond the last group.
        """
        if not groups:
            groups = (0,)
        result = []
        for g in groups:
            g = self._group_number(g)
            if len(self.regs) <= g:
                raise IndexError('group "' + str(g) + '" is undefined')
            result.append(self._substring(g))
        if len(result) > 1:
            return tuple(result)
        return result[0]