"""Regexp-based split and replace using the obsolete regex module.

This module is only for backward compatibility.  These operations
are now provided by the new regular expression module, "re".

sub(pat, repl, str):        replace first occurrence of pattern in string
gsub(pat, repl, str):       replace all occurrences of pattern in string
split(str, pat, maxsplit):  split string using pattern as delimiter
splitx(str, pat, maxsplit): split string using pattern as delimiter plus
                            return delimiters
"""
Guido van Rossum7a461e51992-09-20 21:41:09 +000012
13import regex
14
15
# Replace first occurrence of pattern pat in string str by replacement
# repl.  If the pattern isn't found, the string is returned unchanged.
# The replacement may contain references \digit to subpatterns and
# escaped backslashes.  The pattern may be a string or an already
# compiled pattern.
21
def sub(pat, repl, str):
    """Replace the first match of pat in str with the expansion of repl.

    pat may be a pattern string or an already compiled pattern object.
    When the pattern does not match, str is returned unchanged.
    """
    matcher = compile(pat)
    if matcher.search(str) < 0:
        # A negative search result means no match: nothing to replace.
        return str
    groups = matcher.regs
    # groups[0] is the (start, end) span of the whole match.
    begin, end = groups[0]
    return str[:begin] + expand(repl, groups, str) + str[end:]
29
30
# Replace all (non-overlapping) occurrences of pattern pat in string
# str by replacement repl.  The same rules as for sub() apply.
# Empty matches for the pattern are replaced only when not adjacent to
# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
35
def gsub(pat, repl, str):
    """Replace every non-overlapping match of pat in str with repl.

    pat may be a pattern string or an already compiled pattern object.
    An empty match directly after the previous match is skipped, so
    gsub('', '-', 'abc') returns '-a-b-c-'.
    """
    matcher = compile(pat)
    result = ''
    pos = 0
    seen_match = 0
    while matcher.search(str, pos) >= 0:
        groups = matcher.regs
        lo, hi = groups[0]
        if seen_match and lo == hi == pos:
            # Empty match adjacent to the previous one: retry one
            # character further on, or stop if that is impossible.
            if pos >= len(str) or matcher.search(str, pos + 1) < 0:
                break
            groups = matcher.regs
            lo, hi = groups[0]
        # Copy the unmatched text, then the expanded replacement.
        result = result + str[pos:lo] + expand(repl, groups, str)
        pos = hi
        seen_match = 1
    # Whatever follows the last match is copied verbatim.
    result = result + str[pos:]
    return result
54
55
# Split string str in fields separated by delimiters matching pattern
# pat.  Only non-empty matches for the pattern are considered, so e.g.
# split('abc', '') returns ['abc'].
# The optional 3rd argument sets the number of splits that are performed.
Guido van Rossum7a461e51992-09-20 21:41:09 +000060
def split(str, pat, maxsplit = 0):
    """Split str on non-empty matches of pat, dropping the delimiters.

    maxsplit limits the number of splits performed; 0 means no limit.
    """
    fields = intsplit(str, pat, maxsplit, retain=0)
    return fields
63
# Split string str in fields separated by delimiters matching pattern
# pat.  Only non-empty matches for the pattern are considered, so e.g.
# splitx('abc', '') returns ['abc'].  The delimiters are also included
# in the list.
# The optional 3rd argument sets the number of splits that are performed.
69
70
def splitx(str, pat, maxsplit = 0):
    """Split str on non-empty matches of pat, keeping the delimiters.

    The result alternates field, delimiter, field, ...
    maxsplit limits the number of splits performed; 0 means no limit.
    """
    fields = intsplit(str, pat, maxsplit, retain=1)
    return fields
73
# Internal function used to implement split() and splitx().
75
def intsplit(str, pat, maxsplit, retain):
    """Split str on non-empty matches of pat and return the list of fields.

    pat may be a pattern string or an already compiled pattern object.
    When retain is true each matched delimiter is appended after the
    field that precedes it.  A non-zero maxsplit stops the scan once
    that many splits have been made; the rest of str becomes the final
    field.
    """
    matcher = compile(pat)
    fields = []
    field_start = 0   # where the field currently being collected begins
    scan = 0          # where the next search starts
    nsplits = 0
    while matcher.search(str, scan) >= 0:
        groups = matcher.regs
        lo, hi = groups[0]
        if lo != hi:
            fields.append(str[field_start:lo])
            if retain:
                fields.append(str[lo:hi])
            field_start = scan = hi
            nsplits = nsplits + 1
            if maxsplit and nsplits >= maxsplit:
                break
        else:
            # Empty matches never split; step past this position.
            scan = scan + 1
            if scan >= len(str):
                break
    fields.append(str[field_start:])
    return fields
98
99
# Capitalize words split using a pattern
101
def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Capitalize each word of str, where pat matches the word separators.

    The separators are kept unchanged in the returned string.
    """
    pieces = splitx(str, pat)
    # splitx() alternates word, separator, word, ...: words sit at the
    # even indices.
    index = 0
    while index < len(pieces):
        pieces[index] = pieces[index].capitalize()
        index = index + 2
    return "".join(pieces)
108
109
# Internal subroutines:
# compile(pat): compile a pattern, caching already compiled patterns
# expand(repl, regs, str): expand \digit escapes in replacement string
113
114
# Manage a cache of compiled regular expressions.
#
# If the pattern is a string, a compiled version of it is returned.  If
# the pattern has been used before we return an already compiled
# version from the cache; otherwise we compile it now and save the
# compiled version in the cache, along with the syntax it was compiled
# with.  Instead of a string, a compiled regular expression can also
# be passed.
Guido van Rossum7a461e51992-09-20 21:41:09 +0000123
# Maps (pattern string, regex syntax) -> compiled pattern object.
cache = {}

def compile(pat):
    """Return a compiled pattern for pat, reusing cached results.

    A string (or instance of a string subclass) is compiled with
    regex.compile() and remembered under the key (pat, current regex
    syntax), so a later call with the same pattern and syntax returns
    the same object.  Any other value is assumed to be an already
    compiled pattern and is returned unchanged.
    """
    # isinstance() rather than an exact type comparison, so that string
    # subclasses are compiled instead of being mistaken for compiled
    # patterns.
    if not isinstance(pat, type('')):
        return pat              # Assume it is a compiled regex
    key = (pat, regex.get_syntax())
    try:
        return cache[key]       # Get it from the cache
    except KeyError:
        prog = cache[key] = regex.compile(pat)
        return prog
135
136
def clear_cache():
    """Forget all cached compiled patterns.

    The module-level cache name is rebound to a fresh empty dictionary.
    """
    global cache
    cache = {}
140
141
# Expand \digit in the replacement.
# Each occurrence of \digit is replaced by the substring of str
# indicated by regs[digit].  To include a literal \ in the
# replacement, double it; other \ escapes are left unchanged (i.e.
# the \ and the following character are both copied).
147
def expand(repl, regs, str):
    """Expand backslash escapes in the replacement template repl.

    \\digit is replaced by the slice of str given by the (start, end)
    pair regs[digit]; a doubled backslash yields a single backslash;
    any other escape, and a lone trailing backslash, is copied as is.
    A template without backslashes is returned unchanged.
    """
    if '\\' not in repl:
        return repl
    pieces = []
    pos = 0
    end = len(repl)
    while pos < end:
        ch = repl[pos]
        pos = pos + 1
        if ch != '\\' or pos >= end:
            # Ordinary character, or a backslash with nothing after it.
            pieces.append(ch)
            continue
        escaped = repl[pos]
        pos = pos + 1
        if '0' <= escaped <= '9':
            lo, hi = regs[int(escaped)]
            pieces.append(str[lo:hi])
        elif escaped == '\\':
            pieces.append(escaped)
        else:
            pieces.append('\\' + escaped)
    return ''.join(pieces)
168
169
# Test program, reads sequences "pat repl str" from stdin.
# Optional argument specifies pattern used to split lines.
172
def test():
    """Interactive test driver: read "pat repl str" lines from stdin.

    Each input line is split into three fields using the delimiter
    pattern given as the first command-line argument (default: runs of
    blanks, tabs and newlines), and the results of sub() and gsub() on
    those fields are printed.  Lines that do not yield exactly three
    fields are reported and skipped.  Reading stops at end of input.
    """
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    while 1:
        if sys.stdin.isatty():
            sys.stderr.write('--> ')    # prompt only when interactive
        line = sys.stdin.readline()
        if not line:
            break
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            print('Sorry, not three fields')
            print('split: ' + repr(fields))
            continue
        # Reuse the fields already computed instead of splitting the
        # line a second time.
        pat, repl, text = fields
        print('sub : ' + repr(sub(pat, repl, text)))
        print('gsub: ' + repr(gsub(pat, repl, text)))