blob: 3fb9408367ea8543aaef0dbcbd3704e5a79c4d66 [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
Guido van Rossumbf9d3531997-10-06 14:45:17 +00003
4import sys
Guido van Rossum5ca1b711997-07-10 21:00:31 +00005import string
Guido van Rossumbf9d3531997-10-06 14:45:17 +00006from pcre import *
Guido van Rossum5ca1b711997-07-10 21:00:31 +00007
#
# First, the public part of the interface:
#

# pcre.error and re.error should be the same, since exceptions can be
# raised from either module.
Guido van Rossum5ca1b711997-07-10 21:00:31 +000014
15# compilation flags
16
# Single-letter aliases for the compilation flags.  The long names are
# not defined in this file; presumably they are supplied by the
# "from pcre import *" at the top of the module.
I = IGNORECASE
L = LOCALE
M = MULTILINE
S = DOTALL
X = VERBOSE
Guido van Rossum09bcfd61997-07-15 15:38:20 +000022
23#
24#
25#
26
# Compiled patterns remembered by _cachecompile(), keyed by the tuple
# (pattern, flags).
_cache = {}
# Once the cache holds this many entries, _cachecompile() empties it
# completely before storing a new one.
_MAXCACHE = 20
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000029
def _cachecompile(pattern, flags=0):
    """Return the compiled form of pattern, reusing a cached copy if any.

    Compiled patterns are remembered in _cache under the key
    (pattern, flags).  When the cache already holds _MAXCACHE entries it
    is emptied completely before the new entry is stored.
    """
    key = (pattern, flags)
    compiled = _cache.get(key)
    if compiled is None:
        # Not cached yet: compile it now and remember the result.
        compiled = compile(pattern, flags)
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        _cache[key] = compiled
    return compiled
41
def match(pattern, string, flags=0):
    """Try to apply pattern at the start of string.

    pattern is either a pattern string, which is compiled with flags
    (going through the pattern cache), or an already compiled pattern
    object, which is used as is; flags is ignored in the latter case.
    This mirrors how sub(), subn() and split() treat their pattern
    argument.

    Returns whatever the pattern object's match() method returns.
    """
    if type(pattern) == type(''):
        pattern = _cachecompile(pattern, flags)
    return pattern.match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000044
def search(pattern, string, flags=0):
    """Scan through string looking for a match to pattern.

    pattern is either a pattern string, which is compiled with flags
    (going through the pattern cache), or an already compiled pattern
    object, which is used as is; flags is ignored in the latter case.
    This mirrors how sub(), subn() and split() treat their pattern
    argument.

    Returns whatever the pattern object's search() method returns.
    """
    if type(pattern) == type(''):
        pattern = _cachecompile(pattern, flags)
    return pattern.search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000047
def sub(pattern, repl, string, count=0):
    """Replace occurrences of pattern in string by repl.

    pattern may be a pattern string (compiled through the pattern cache
    with default flags) or an already compiled pattern object.  The
    work is delegated to the pattern object's sub() method, whose result
    is returned.
    """
    if type(pattern) != type(''):
        regex = pattern
    else:
        regex = _cachecompile(pattern)
    return regex.sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000052
def subn(pattern, repl, string, count=0):
    """Like sub(), but delegate to the pattern object's subn() method.

    pattern may be a pattern string (compiled through the pattern cache
    with default flags) or an already compiled pattern object.  The
    result of the pattern object's subn() method is returned unchanged.
    """
    if type(pattern) != type(''):
        regex = pattern
    else:
        regex = _cachecompile(pattern)
    return regex.subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000057
def split(pattern, string, maxsplit=0):
    """Split string by the occurrences of pattern.

    pattern may be a pattern string (compiled through the pattern cache
    with default flags) or an already compiled pattern object.  The
    work is delegated to the pattern object's split() method, whose
    result is returned.
    """
    if type(pattern) != type(''):
        regex = pattern
    else:
        regex = _cachecompile(pattern)
    return regex.split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000062
def escape(pattern):
    """Escape all non-alphanumeric characters in pattern.

    Letters (string.letters), digits and the underscore are copied
    unchanged; every other character is preceded by a backslash.  The
    NUL character is written as the octal escape \\000 rather than as a
    backslash followed by a literal NUL, so that the escaped pattern
    never contains an embedded NUL character itself.

    Returns the escaped pattern as a new string.
    """
    alphanum = string.letters + '_' + string.digits
    result = []
    for char in pattern:
        if char == '\000':
            result.append('\\000')
        elif char in alphanum:
            result.append(char)
        else:
            result.append('\\' + char)
    return string.join(result, '')
72
def compile(pattern, flags=0):
    """Compile a regular expression pattern, returning a RegexObject.

    A fresh dictionary is handed to pcre_compile() together with the
    pattern and flags, and the same dictionary is then stored on the
    returned RegexObject as its groupindex.
    """
    names = {}
    compiled = pcre_compile(pattern, flags, names)
    return RegexObject(pattern, flags, compiled, names)
78
79
#
# Class definitions
#
83
84class RegexObject:
Guido van Rossumbf9d3531997-10-06 14:45:17 +000085 def __init__(self, pattern, flags, code, groupindex):
86 self.code = code
Guido van Rossum5ca1b711997-07-10 21:00:31 +000087 self.flags = flags
88 self.pattern = pattern
89 self.groupindex = groupindex
Guido van Rossumdfa67901997-12-08 17:12:06 +000090
91 def search(self, string, pos=0, endpos=None):
92 """Scan through string looking for a match to the pattern, returning
93 a MatchObject instance, or None if no match was found."""
94
95 if endpos is None or endpos>len(string):
96 endpos=len(string)
97 if endpos<pos: endpos=pos
98 regs = self.code.match(string, pos, endpos, 0)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000099 if regs is None:
100 return None
Guido van Rossumdfa67901997-12-08 17:12:06 +0000101 self._num_regs=len(regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000102
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000103 return MatchObject(self,
104 string,
Guido van Rossumdfa67901997-12-08 17:12:06 +0000105 pos, endpos,
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000106 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000107
Guido van Rossumdfa67901997-12-08 17:12:06 +0000108 def match(self, string, pos=0, endpos=None):
109 """Try to apply the pattern at the start of the string, returning
110 a MatchObject instance, or None if no match was found."""
111
112 if endpos is None or endpos>len(string):
113 endpos=len(string)
114 if endpos<pos: endpos=pos
115 regs = self.code.match(string, pos, endpos, ANCHORED)
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000116 if regs is None:
117 return None
Guido van Rossumdfa67901997-12-08 17:12:06 +0000118 self._num_regs=len(regs)
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000119 return MatchObject(self,
120 string,
Guido van Rossumdfa67901997-12-08 17:12:06 +0000121 pos, endpos,
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000122 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000123
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000124 def sub(self, repl, string, count=0):
Guido van Rossumdfa67901997-12-08 17:12:06 +0000125 """Return the string obtained by replacing the leftmost
126 non-overlapping occurrences of the pattern in string by the
127 replacement repl"""
128
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000129 return self.subn(repl, string, count)[0]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000130
Guido van Rossumdfa67901997-12-08 17:12:06 +0000131 def subn(self, repl, source, count=0):
132 """Return a 2-tuple containing (new_string, number).
133 new_string is the string obtained by replacing the leftmost
134 non-overlapping occurrences of the pattern in string by the
135 replacement repl. number is the number of substitutions that
136 were made."""
137
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000138 if count < 0:
Guido van Rossumbf9d3531997-10-06 14:45:17 +0000139 raise error, "negative substitution count"
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000140 if count == 0:
141 import sys
142 count = sys.maxint
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000143 if type(repl) == type(''):
144 if '\\' in repl:
Guido van Rossumbf9d3531997-10-06 14:45:17 +0000145 repl = lambda m, r=repl: pcre_expand(m, r)
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000146 else:
147 repl = lambda m, r=repl: r
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000148 n = 0 # Number of matches
149 pos = 0 # Where to start searching
150 lastmatch = -1 # End of last match
151 results = [] # Substrings making up the result
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000152 end = len(source)
153 while n < count and pos <= end:
154 m = self.search(source, pos)
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000155 if not m:
156 break
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000157 i, j = m.span(0)
158 if i == j == lastmatch:
159 # Empty match adjacent to previous match
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000160 pos = pos + 1
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000161 results.append(source[lastmatch:pos])
162 continue
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000163 if pos < i:
164 results.append(source[pos:i])
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000165 results.append(repl(m))
166 pos = lastmatch = j
167 if i == j:
168 # Last match was empty; don't try here again
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000169 pos = pos + 1
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000170 results.append(source[lastmatch:pos])
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000171 n = n + 1
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000172 results.append(source[pos:])
173 return (string.join(results, ''), n)
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000174
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000175 def split(self, source, maxsplit=0):
Guido van Rossumdfa67901997-12-08 17:12:06 +0000176 """Split \var{string} by the occurrences of the pattern,
177 returning a list containing the resulting substrings."""
178
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000179 if maxsplit < 0:
180 raise error, "negative split count"
181 if maxsplit == 0:
182 import sys
183 maxsplit = sys.maxint
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000184 n = 0
185 pos = 0
186 lastmatch = 0
187 results = []
188 end = len(source)
189 while n < maxsplit:
190 m = self.search(source, pos)
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000191 if not m:
192 break
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000193 i, j = m.span(0)
194 if i == j:
195 # Empty match
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000196 if pos >= end:
197 break
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000198 pos = pos+1
199 continue
200 results.append(source[lastmatch:i])
Guido van Rossumdfa67901997-12-08 17:12:06 +0000201 g = m.groups()
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000202 if g:
Guido van Rossumdfa67901997-12-08 17:12:06 +0000203 if type(g)==type( "" ): g = [g]
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000204 results[len(results):] = list(g)
205 pos = lastmatch = j
Guido van Rossum2b2b3f91998-01-12 18:57:53 +0000206 n = n + 1
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000207 results.append(source[lastmatch:])
208 return results
209
Guido van Rossumdfa67901997-12-08 17:12:06 +0000210 # The following 3 functions were contributed by Mike Fletcher, and
211 # allow pickling and unpickling of RegexObject instances.
212 def __getinitargs__(self):
213 return (None,None,None,None) # any 4 elements, to work around
214 # problems with the
215 # pickle/cPickle modules not yet
216 # ignoring the __init__ function
217 def __getstate__(self):
218 return self.pattern, self.flags, self.groupindex
219 def __setstate__(self, statetuple):
220 self.pattern = statetuple[0]
221 self.flags = statetuple[1]
222 self.groupindex = statetuple[2]
223 self.code = apply(pcre_compile, statetuple)
224
class MatchObject:
    """The result of a successful search or match.

    Attributes:

    re     -- the pattern object that produced this match; its
              groupindex dictionary maps group names to group numbers
    string -- the string the match was made against
    pos    -- the pos value given to the constructor
    endpos -- the endpos value given to the constructor
    regs   -- sequence of (start, end) index pairs into string, one per
              group, group 0 being the whole match; a group with -1 as
              start or end is reported as None
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _group_number(self, g):
        # Translate a group name or number into a valid index into
        # self.regs.  Unknown names and numbers outside the range
        # 0..len(self.regs)-1 (including negative ones, which would
        # otherwise silently index from the end) raise IndexError.
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group "' + g + '" is undefined')
        if g < 0 or g >= len(self.regs):
            raise IndexError('group "' + str(g) + '" is undefined')
        return g

    def _substring(self, g):
        # Return the text matched by group number g, or None if either
        # of its recorded indices is -1.
        reg = self.regs[g]
        if reg[0] == -1 or reg[1] == -1:
            return None
        return self.string[reg[0]:reg[1]]

    def start(self, g = 0):
        "Return the start of the substring matched by group g"
        return self.regs[self._group_number(g)][0]

    def end(self, g = 0):
        "Return the end of the substring matched by group g"
        return self.regs[self._group_number(g)][1]

    def span(self, g = 0):
        """Return a tuple containing the start,end of the substring
        matched by group g"""
        return self.regs[self._group_number(g)]

    def groups(self):
        """Return a tuple containing all subgroups of the match object.

        Group 0 (the whole match) is not included; a group whose
        recorded indices are -1 is represented by None."""
        # The group count comes from this match's own regs rather than
        # from state stored on the shared pattern object.
        result = []
        for g in range(1, len(self.regs)):
            result.append(self._substring(g))
        return tuple(result)

    def group(self, *groups):
        """Return one or more groups of the match.

        Each argument is a group number or group name.  Without
        arguments the whole match (group 0) is returned.  With a single
        argument the result is a string, or None if that group's
        recorded indices are -1; with several arguments it is a tuple
        with one item per argument.  IndexError is raised for an
        undefined group."""
        if len(groups) == 0:
            groups = (0,)
        result = []
        for g in groups:
            result.append(self._substring(self._group_number(g)))
        if len(result) == 1:
            return result[0]
        return tuple(result)
292 return ()
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000293