blob: bf74808377b08d94ec1688e9d233cc4419f2793e [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
3# $Id$
4
Guido van Rossumbf9d3531997-10-06 14:45:17 +00005
6import sys
Guido van Rossum5ca1b711997-07-10 21:00:31 +00007import string
Guido van Rossumbf9d3531997-10-06 14:45:17 +00008from pcre import *
Guido van Rossum5ca1b711997-07-10 21:00:31 +00009
Guido van Rossumbf9d3531997-10-06 14:45:17 +000010#
11# First, the public part of the interface:
12#
13
14# pcre.error and re.error should be the same, since exceptions can be
Guido van Rossum6af4abd1997-08-13 03:25:34 +000015# raised from either module.
Guido van Rossum5ca1b711997-07-10 21:00:31 +000016
17# compilation flags
18
# Single-letter shorthands for the compilation flags.  The long names are not
# defined in this file; presumably they arrive through "from pcre import *".
I, M = IGNORECASE, MULTILINE
S, X = DOTALL, VERBOSE
Guido van Rossum09bcfd61997-07-15 15:38:20 +000023
24#
25#
26#
27
# Cache of compiled patterns, keyed by (pattern, flags), used by the
# module-level convenience functions.
_cache = {}
_MAXCACHE = 20          # once this many entries are stored, the cache is emptied

def _cachecompile(pattern, flags=0):
    """Return the compiled object for (pattern, flags), compiling on a miss.

    A miss compiles the pattern with compile() and stores the result.  If
    the cache already holds _MAXCACHE entries at that point, it is cleared
    completely before the new entry is added.
    """
    key = (pattern, flags)
    cached = _cache.get(key)
    if cached is not None:
        return cached
    compiled = compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[key] = compiled
    return compiled
42
def match(pattern, string, flags=0):
    """Compile pattern with flags (through the cache) and return the result
    of calling the compiled object's match() method on string."""
    regex = _cachecompile(pattern, flags)
    return regex.match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000045
def search(pattern, string, flags=0):
    """Compile pattern with flags (through the cache) and return the result
    of calling the compiled object's search() method on string."""
    regex = _cachecompile(pattern, flags)
    return regex.search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000048
def sub(pattern, repl, string, count=0):
    """Return pattern.sub(repl, string, count).

    A pattern given as a string is first compiled through the cache (with
    default flags); anything else is used as an already-compiled object.
    """
    if type(pattern) != type(''):
        return pattern.sub(repl, string, count)
    return _cachecompile(pattern).sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000053
def subn(pattern, repl, string, count=0):
    """Return pattern.subn(repl, string, count).

    A pattern given as a string is first compiled through the cache (with
    default flags); anything else is used as an already-compiled object.
    """
    if type(pattern) != type(''):
        return pattern.subn(repl, string, count)
    return _cachecompile(pattern).subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000058
def split(pattern, string, maxsplit=0):
    """Return pattern.split(string, maxsplit).

    A pattern given as a string is first compiled through the cache (with
    default flags); anything else is used as an already-compiled object.
    """
    if type(pattern) != type(''):
        return pattern.split(string, maxsplit)
    return _cachecompile(pattern).split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000063
64#
65#
66#
67
class RegexObject:
    """A compiled regular expression.

    Attributes:
      code       -- low-level compiled object; its match() method does the
                    actual matching
      flags      -- the flags the pattern was compiled with
      pattern    -- the pattern the object was built from
      groupindex -- mapping from group name to group number
      num_regs   -- length of the register list of the most recent
                    successful search()/match() (absent until one succeeds)
    """

    def __init__(self, pattern, flags, code, groupindex):
        self.code = code
        self.flags = flags
        self.pattern = pattern
        self.groupindex = groupindex

    def _run(self, string, pos, options):
        """Call the low-level matcher and wrap a hit in a MatchObject.

        Returns None when the matcher returns None.
        """
        regs = self.code.match(string, pos, options)
        if regs is None:
            return None
        self.num_regs = len(regs)
        return MatchObject(self, string, pos, regs)

    def search(self, string, pos=0):
        """Run the matcher on string from offset pos with no option bits set.

        Returns a MatchObject, or None if nothing matched.
        """
        return self._run(string, pos, 0)

    def match(self, string, pos=0):
        """Run the matcher on string from offset pos with the ANCHORED option.

        Returns a MatchObject, or None if nothing matched.
        """
        return self._run(string, pos, ANCHORED)

    def sub(self, repl, string, count=0):
        """Like subn(), but return only the resulting string."""
        return self.subn(repl, string, count)[0]

    def subn(self, repl, source, count=0):
        """Replace matches of the pattern in source.

        repl is either a callable taking the MatchObject and returning the
        replacement text, or a string.  A string containing a backslash is
        expanded for every match with pcre_expand(); any other string is
        inserted unchanged.

        count is the maximum number of replacements; 0 means no limit.

        Returns the tuple (new_string, number_of_replacements).
        Raises error if count is negative.
        """
        if count < 0:
            raise error("negative substitution count")
        unlimited = (count == 0)
        if type(repl) == type(''):
            if '\\' in repl:
                repl = lambda m, r=repl: pcre_expand(m, r)
            else:
                repl = lambda m, r=repl: r
        n = 0               # Number of replacements made
        pos = 0             # Where to start searching
        lastmatch = -1      # End of last match
        results = []        # Substrings making up the result
        end = len(source)
        while (unlimited or n < count) and pos <= end:
            m = self.search(source, pos)
            if not m:
                break
            i, j = m.span(0)
            if i == j == lastmatch:
                # Empty match directly after the previous match: do not
                # replace it, copy one character and retry one step later.
                pos = pos + 1
                results.append(source[lastmatch:pos])
                continue
            if pos < i:
                # Keep the unmatched text preceding this match.
                results.append(source[pos:i])
            results.append(repl(m))
            pos = lastmatch = j
            if i == j:
                # The match was empty; copy one character and move past it
                # so the same position is not matched again.
                pos = pos + 1
                results.append(source[lastmatch:pos])
            n = n + 1
        results.append(source[pos:])
        return (string.join(results, ''), n)

    def split(self, source, maxsplit=0):
        """Split source at the non-empty matches of the pattern.

        The returned list holds the text between matches; after each piece,
        the values returned by the match's group() call (the pattern's
        subgroups, if any) are inserted as well.  Empty matches never cause
        a split.

        maxsplit is the maximum number of splits; 0 means no limit.

        Raises error if maxsplit is negative.
        """
        if maxsplit < 0:
            raise error("negative split count")
        unlimited = (maxsplit == 0)
        n = 0               # Number of splits made
        pos = 0             # Where to start searching
        lastmatch = 0       # End of the last match that caused a split
        results = []
        end = len(source)
        while unlimited or n < maxsplit:
            m = self.search(source, pos)
            if not m:
                break
            i, j = m.span(0)
            if i == j:
                # Empty match: skip it, stopping once the end is reached.
                if pos >= end:
                    break
                pos = pos + 1
                continue
            results.append(source[lastmatch:i])
            g = m.group()
            if g:
                results[len(results):] = list(g)
            pos = lastmatch = j
            # Count the split so that maxsplit is actually honoured.
            n = n + 1
        results.append(source[lastmatch:])
        return results
165
class MatchObject:
    """The result of a successful search() or match().

    Attributes:
      re     -- the regex object that produced the match
      string -- the string that was searched
      pos    -- the offset at which the search started
      regs   -- sequence of (start, end) pairs, one per group, with group 0
                first; a group that did not take part in the match has -1
                in its pair
    """

    def __init__(self, re, string, pos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.regs = regs

    def _groupnum(self, g):
        """Turn a group name or number into a valid index into self.regs.

        Raises IndexError for an unknown name or an out-of-range number.
        """
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group "' + g + '" is undefined')
        # Negative numbers are rejected too; plain sequence indexing would
        # silently count them from the end.
        if g < 0 or g >= len(self.regs):
            raise IndexError('group "' + str(g) + '" is undefined')
        return g

    def start(self, g):
        """Return the start offset recorded for group g (name or number)."""
        return self.regs[self._groupnum(g)][0]

    def end(self, g):
        """Return the end offset recorded for group g (name or number)."""
        return self.regs[self._groupnum(g)][1]

    def span(self, g):
        """Return the (start, end) entry recorded for group g."""
        return self.regs[self._groupnum(g)]

    def group(self, *groups):
        """Return the text matched by one or more groups.

        With no arguments, returns a tuple holding every group from 1
        upwards.  With one argument, returns that group's text; with
        several, a tuple of texts.  Groups may be given by number or by
        name.  A group that did not take part in the match yields None.

        Raises IndexError for an unknown name or out-of-range number.
        """
        if len(groups) == 0:
            # Use this match's own register list instead of state stored on
            # the shared regex object by whichever match ran last.
            groups = range(1, len(self.regs))
            use_all = 1
        else:
            use_all = 0
        result = []
        for g in groups:
            reg = self.regs[self._groupnum(g)]
            if reg[0] == -1 or reg[1] == -1:
                result.append(None)
            else:
                result.append(self.string[reg[0]:reg[1]])
        if use_all or len(result) != 1:
            return tuple(result)
        return result[0]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000221
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000222def escape(pattern):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000223 result = []
Guido van Rossumbf9d3531997-10-06 14:45:17 +0000224 alphanum=string.letters+'_'+string.digits
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000225 for char in pattern:
Guido van Rossumbf9d3531997-10-06 14:45:17 +0000226 if char not in alphanum:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000227 result.append('\\')
228 result.append(char)
229 return string.join(result, '')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000230
def compile(pattern, flags=0):
    """Compile pattern with flags and return a RegexObject.

    A fresh dictionary is handed to pcre_compile() as its third argument and
    the same dictionary becomes the RegexObject's groupindex (presumably
    pcre_compile() fills it with the pattern's named groups).
    """
    names = {}
    return RegexObject(pattern, flags,
                       pcre_compile(pattern, flags, names),
                       names)
235
Guido van Rossum04a1d741997-07-15 14:38:13 +0000236