#!/usr/bin/env python
# -*- mode: python -*-
Guido van Rossumbf9d3531997-10-06 14:45:17 +00003
4import sys
Guido van Rossum5ca1b711997-07-10 21:00:31 +00005import string
Guido van Rossumbf9d3531997-10-06 14:45:17 +00006from pcre import *
Guido van Rossum5ca1b711997-07-10 21:00:31 +00007
#
# First, the public part of the interface:
#

# pcre.error and re.error should be the same, since exceptions can be
# raised from either module.

# compilation flags
16
# One-letter aliases for the compilation flags that "from pcre import *"
# brings into this module's namespace.
(I, L, M, S, X) = (IGNORECASE, LOCALE, MULTILINE, DOTALL, VERBOSE)
Guido van Rossum09bcfd61997-07-15 15:38:20 +000022
#
#
#
26
# Compiled patterns, keyed by (pattern, flags), shared by the module-level
# convenience functions below.
_cache = {}
# Once the cache holds this many entries it is emptied before the next insert.
_MAXCACHE = 20

def _cachecompile(pattern, flags=0):
    """Return the compiled object for (pattern, flags), using the cache.

    On a cache miss the pattern is compiled with compile() and stored.
    Eviction is deliberately crude: when the cache already holds
    _MAXCACHE entries it is cleared completely before the new entry is
    added.
    """
    key = (pattern, flags)
    cached = _cache.get(key)
    if cached is not None:
        return cached
    compiled = compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[key] = compiled
    return compiled
41
def match(pattern, string, flags=0):
    """Apply pattern at the start of string via the pattern's match() method.

    pattern may be a pattern string, which is compiled (and cached) with
    the given flags, or an already compiled pattern object, which is used
    as is -- the same convention sub(), subn() and split() follow.  flags
    is ignored when a compiled object is passed.

    Returns whatever the compiled pattern's match() method returns.
    """
    if type(pattern) == type(''):
        pattern = _cachecompile(pattern, flags)
    return pattern.match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000044
def search(pattern, string, flags=0):
    """Scan string for pattern via the pattern's search() method.

    pattern may be a pattern string, which is compiled (and cached) with
    the given flags, or an already compiled pattern object, which is used
    as is -- the same convention sub(), subn() and split() follow.  flags
    is ignored when a compiled object is passed.

    Returns whatever the compiled pattern's search() method returns.
    """
    if type(pattern) == type(''):
        pattern = _cachecompile(pattern, flags)
    return pattern.search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000047
def sub(pattern, repl, string, count=0):
    """Delegate to the sub() method of a compiled pattern.

    A pattern given as a string is first compiled through the pattern
    cache; any other object is assumed to be compiled already and is
    used directly.
    """
    regex = pattern
    if type(regex) is type(''):
        regex = _cachecompile(regex)
    return regex.sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000052
def subn(pattern, repl, string, count=0):
    """Delegate to the subn() method of a compiled pattern.

    A pattern given as a string is first compiled through the pattern
    cache; any other object is assumed to be compiled already and is
    used directly.
    """
    regex = pattern
    if type(regex) is type(''):
        regex = _cachecompile(regex)
    return regex.subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000057
def split(pattern, string, maxsplit=0):
    """Delegate to the split() method of a compiled pattern.

    A pattern given as a string is first compiled through the pattern
    cache; any other object is assumed to be compiled already and is
    used directly.
    """
    regex = pattern
    if type(regex) is type(''):
        regex = _cachecompile(regex)
    return regex.split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000062
# Characters escape() copies through unchanged: ASCII letters, "_" and
# digits.  Spelled out (rather than taken from string.letters) so the
# result does not vary with the process locale, and built once rather than
# on every call.
_ESCAPE_SAFE = ("abcdefghijklmnopqrstuvwxyz"
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                "_0123456789")

def escape(pattern):
    """Escape all non-alphanumeric characters in pattern.

    ASCII letters, digits and the underscore are copied unchanged.  A NUL
    character becomes the four characters backslash-0-0-0; every other
    character is preceded by a backslash.  Returns the escaped string.
    """
    result = []
    for char in pattern:
        if char in _ESCAPE_SAFE:
            result.append(char)
        elif char == '\000':
            # A raw NUL cannot be written as backslash + NUL; emit the
            # octal escape text instead.
            result.append('\\000')
        else:
            result.append('\\' + char)
    return ''.join(result)
73
def compile(pattern, flags=0):
    """Compile a regular expression pattern, returning a RegexObject.

    A fresh dictionary is handed to pcre_compile() as its third argument
    and then stored on the returned object as its groupindex.
    """
    names = {}
    compiled = pcre_compile(pattern, flags, names)
    return RegexObject(pattern, flags, compiled, names)
79
80
#
# Class definitions
#
84
class RegexObject:
    """A compiled regular expression.

    Attributes:
      code       -- the object returned by pcre_compile(); its match()
                    method does the actual matching
      flags      -- the flags the pattern was compiled with
      pattern    -- the pattern that was compiled
      groupindex -- dictionary used to look up group names (see
                    MatchObject)
    """

    def __init__(self, pattern, flags, code, groupindex):
        self.code = code
        self.flags = flags
        self.pattern = pattern
        self.groupindex = groupindex

    def _execute(self, string, pos, endpos, options):
        """Run the compiled code on string[pos:endpos]; shared by search()
        and match().

        endpos is clamped to len(string) and is never allowed to be
        smaller than pos.  options is passed through unchanged as the last
        argument of self.code.match().  Returns a MatchObject, or None when
        the code object reports no match.
        """
        length = len(string)
        if endpos is None or endpos > length:
            endpos = length
        if endpos < pos:
            endpos = pos
        regs = self.code.match(string, pos, endpos, options)
        if regs is None:
            return None
        # Kept for code that reads the register count from the pattern
        # object rather than from the match.
        self._num_regs = len(regs)
        return MatchObject(self, string, pos, endpos, regs)

    def search(self, string, pos=0, endpos=None):
        """Scan through string looking for a match to the pattern, returning
        a MatchObject instance, or None if no match was found."""
        return self._execute(string, pos, endpos, 0)

    def match(self, string, pos=0, endpos=None):
        """Try to apply the pattern at the start of the string, returning
        a MatchObject instance, or None if no match was found."""
        return self._execute(string, pos, endpos, ANCHORED)

    def sub(self, repl, string, count=0):
        """Return the string obtained by replacing the leftmost
        non-overlapping occurrences of the pattern in string by the
        replacement repl.

        This is subn() with the substitution count discarded.
        """
        return self.subn(repl, string, count)[0]

    def subn(self, repl, source, count=0):
        """Return a 2-tuple containing (new_string, number).

        new_string is the string obtained by replacing the leftmost
        non-overlapping occurrences of the pattern in source by the
        replacement repl.  number is the number of substitutions that
        were made.

        repl is either a string or a callable taking the MatchObject and
        returning the replacement text.  A string containing a backslash
        is expanded with pcre_expand() for every match; any other string
        is inserted literally.  count limits the number of substitutions;
        0 means no limit.  A negative count raises error.
        """
        if count < 0:
            raise error("negative substitution count")
        if type(repl) == type(''):
            if '\\' in repl:
                repl = lambda m, r=repl: pcre_expand(m, r)
            else:
                repl = lambda m, r=repl: r
        n = 0               # Number of substitutions made
        pos = 0             # Where to start searching
        lastmatch = -1      # End of last match
        results = []        # Substrings making up the result
        end = len(source)
        while (count == 0 or n < count) and pos <= end:
            m = self.search(source, pos)
            if m is None:
                break
            i, j = m.span(0)
            if i == j == lastmatch:
                # Empty match adjacent to previous match: copy one
                # character through and try again one position later.
                pos = pos + 1
                results.append(source[lastmatch:pos])
                continue
            if pos < i:
                results.append(source[pos:i])
            results.append(repl(m))
            pos = lastmatch = j
            if i == j:
                # Last match was empty; don't try here again
                pos = pos + 1
                results.append(source[lastmatch:pos])
            n = n + 1
        results.append(source[pos:])
        return (''.join(results), n)

    def split(self, source, maxsplit=0):
        """Split source by the occurrences of the pattern, returning a
        list containing the resulting substrings.

        After each piece, the values of the match's groups() are added to
        the list as well.  Empty matches never split the string.  maxsplit
        limits the number of splits; 0 means no limit.  A negative
        maxsplit raises error.
        """
        if maxsplit < 0:
            raise error("negative split count")
        n = 0               # Number of splits made
        pos = 0             # Where to start searching
        lastmatch = 0       # End of the last match that caused a split
        results = []
        end = len(source)
        while maxsplit == 0 or n < maxsplit:
            m = self.search(source, pos)
            if m is None:
                break
            i, j = m.span(0)
            if i == j:
                # Empty match: skip it, or stop at the end of the string
                if pos >= end:
                    break
                pos = pos + 1
                continue
            results.append(source[lastmatch:i])
            results.extend(m.groups())
            pos = lastmatch = j
            n = n + 1
        results.append(source[lastmatch:])
        return results

    # The following 3 functions were contributed by Mike Fletcher, and
    # allow pickling and unpickling of RegexObject instances.
    def __getinitargs__(self):
        # Any 4 elements will do; this works around pickle/cPickle not
        # yet ignoring the __init__ function.
        return (None, None, None, None)

    def __getstate__(self):
        # The compiled code object itself is not part of the state.
        return self.pattern, self.flags, self.groupindex

    def __setstate__(self, statetuple):
        self.pattern = statetuple[0]
        self.flags = statetuple[1]
        self.groupindex = statetuple[2]
        # Recompile from (pattern, flags, groupindex).
        self.code = pcre_compile(*statetuple)
225
class MatchObject:
    """The result of a successful search or match.

    Attributes:
      re     -- the pattern object that produced the match; its groupindex
                dictionary is used to look up group names
      string -- the string that was searched
      pos    -- start of the searched region
      endpos -- end of the searched region
      regs   -- sequence of (start, end) pairs, one per group, with entry 0
                describing the whole match; -1 in a pair marks a group
                that has no text
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _index(self, g):
        """Turn a group name or number into a valid index into self.regs.

        Raises IndexError for an unknown name and for a number that is
        negative or not smaller than len(self.regs).
        """
        if isinstance(g, type('')):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group "' + g + '" is undefined')
        if g < 0 or g >= len(self.regs):
            raise IndexError('group "' + str(g) + '" is undefined')
        return g

    def _text(self, index):
        """Return the text of the group at index, or None if either end of
        its span is -1."""
        first, last = self.regs[index]
        if first == -1 or last == -1:
            return None
        return self.string[first:last]

    def start(self, g = 0):
        "Return the start of the substring matched by group g"
        return self.regs[self._index(g)][0]

    def end(self, g = 0):
        "Return the end of the substring matched by group g"
        return self.regs[self._index(g)][1]

    def span(self, g = 0):
        """Return a tuple containing the start,end of the substring
        matched by group g"""
        return self.regs[self._index(g)]

    def groups(self):
        """Return a tuple containing all subgroups of the match object.

        Group 0 (the whole match) is not included; a group without text
        is reported as None.
        """
        result = []
        # Count the groups from this match's own registers instead of
        # from state stored on the shared pattern object.
        for index in range(1, len(self.regs)):
            result.append(self._text(index))
        return tuple(result)

    def group(self, *groups):
        """Return one or more groups of the match.

        With no arguments the whole match (group 0) is returned.  With one
        argument the result is that group's text, or None if the group has
        no text; with several arguments it is a tuple with one item per
        argument.  Groups may be given by number or by name.  Raises
        IndexError for an undefined group.
        """
        if len(groups) == 0:
            groups = (0,)
        result = []
        for g in groups:
            result.append(self._text(self._index(g)))
        if len(result) > 1:
            return tuple(result)
        return result[0]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000294