blob: 7778602c2d7ef3355a91620934d26e7a2f936884 [file] [log] [blame]
"""Regexp-based split and replace using the obsolete regex module.

This module is only for backward compatibility.  These operations
are now provided by the new regular expression module, "re".

sub(pat, repl, str):        replace first occurrence of pattern in string
gsub(pat, repl, str):       replace all occurrences of pattern in string
split(str, pat, maxsplit):  split string using pattern as delimiter
splitx(str, pat, maxsplit): split string using pattern as delimiter plus
                            return delimiters
"""
Guido van Rossum7a461e51992-09-20 21:41:09 +000012
Guido van Rossum7292e922000-12-19 18:25:58 +000013import warnings
14warnings.warn("the regsub module is deprecated; please use re.sub()",
Tim Peters0c9886d2001-01-15 01:18:21 +000015 DeprecationWarning)
Guido van Rossum7292e922000-12-19 18:25:58 +000016
17# Ignore further deprecation warnings about this module
18warnings.filterwarnings("ignore", "", DeprecationWarning, __name__)
19
Guido van Rossum7a461e51992-09-20 21:41:09 +000020import regex
21
22
# Replace the first occurrence of pattern pat in string str by the
# replacement repl.  When the pattern does not match, str is returned
# unchanged.  repl may contain \digit references to subpatterns and
# doubled backslashes (see expand()).  pat may be a pattern string or
# an already compiled pattern object.

def sub(pat, repl, str):
    matcher = compile(pat)
    if matcher.search(str) < 0:
        # No match anywhere: hand the input back untouched.
        return str
    groups = matcher.regs
    # groups[0] is the (start, end) span of the whole match.
    begin, end = groups[0]
    return str[:begin] + expand(repl, groups, str) + str[end:]
Guido van Rossum7a461e51992-09-20 21:41:09 +000036
37
# Replace all (non-overlapping) occurrences of pattern pat in string
# str by replacement repl.  The same rules as for sub() apply.
# Empty matches for the pattern are replaced only when not adjacent to
# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
#
# The output is collected as a list of pieces and joined once at the
# end, so the cost stays linear in the size of the result instead of
# re-copying the accumulated string on every match.

def gsub(pat, repl, str):
    prog = compile(pat)
    pieces = []
    start = 0
    first = 1
    while prog.search(str, start) >= 0:
        regs = prog.regs
        a, b = regs[0]
        if a == b == start and not first:
            # Empty match right where the previous match ended: skip
            # it and retry one character further on, giving up when
            # the string is exhausted or nothing else matches.
            if start >= len(str) or prog.search(str, start+1) < 0:
                break
            regs = prog.regs
            a, b = regs[0]
        # Copy the unmatched text before the match, then the expansion.
        pieces.append(str[start:a])
        pieces.append(expand(repl, regs, str))
        start = b
        first = 0
    # Whatever follows the last match is copied verbatim.
    pieces.append(str[start:])
    return ''.join(pieces)
Guido van Rossum7a461e51992-09-20 21:41:09 +000061
62
# Split string str into the fields separated by delimiters matching
# pattern pat, returning the list of fields (delimiters are dropped).
# Only non-empty matches count as delimiters, so e.g.
# split('abc', '') returns ['abc'].
# The optional third argument limits the number of splits performed;
# the default of 0 means no limit.

def split(str, pat, maxsplit=0):
    # retain=0: do not keep the delimiters in the result.
    return intsplit(str, pat, maxsplit, 0)
Guido van Rossuma59d3e61996-08-08 18:39:18 +000070
# Like split(), but the matched delimiters are kept: the returned list
# alternates field, delimiter, field, ...  Only non-empty matches count
# as delimiters, so e.g. splitx('abc', '') returns ['abc'].
# The optional third argument limits the number of splits performed;
# the default of 0 means no limit.

def splitx(str, pat, maxsplit=0):
    # retain=1: keep each delimiter as its own list element.
    return intsplit(str, pat, maxsplit, 1)
80
# Internal worker shared by split() and splitx().
#
# str      -- the string to split
# pat      -- pattern string or compiled pattern matching the delimiters
# maxsplit -- maximum number of splits; a false value means unlimited
# retain   -- when true, each delimiter is appended after its field
#
# Returns the list of fields (interleaved with delimiters if retain).

def intsplit(str, pat, maxsplit, retain):
    prog = compile(pat)
    fields = []
    field_start = 0     # where the field being collected begins
    pos = 0             # where the next search starts
    nsplits = 0
    while prog.search(str, pos) >= 0:
        lo, hi = prog.regs[0]
        if lo == hi:
            # Empty matches never act as delimiters; step one
            # character forward and search again.
            pos = pos + 1
            if pos >= len(str):
                break
            continue
        fields.append(str[field_start:lo])
        if retain:
            fields.append(str[lo:hi])
        field_start = pos = hi
        nsplits = nsplits + 1
        if maxsplit and nsplits >= maxsplit:
            break
    # The remainder after the last delimiter is always the final field.
    fields.append(str[field_start:])
    return fields
Guido van Rossum7a461e51992-09-20 21:41:09 +0000105
106
# Capitalize the words of str, where words are the fields obtained by
# splitting on pattern pat (by default, runs of characters that are
# not letters, digits or underscores).  The delimiters themselves are
# preserved unchanged in the result.

def capwords(str, pat='[^a-zA-Z0-9_]+'):
    words = splitx(str, pat)
    # splitx() alternates field, delimiter, field, ...; the fields sit
    # at the even indexes, so only those are capitalized.
    for i in range(0, len(words), 2):
        words[i] = words[i].capitalize()
    # String objects have join() but no joinfields() method, so the
    # pieces are glued back together with "".join().
    return "".join(words)
Guido van Rossum4cc4ab11996-06-11 18:45:15 +0000114
115
Guido van Rossum7a461e51992-09-20 21:41:09 +0000116# Internal subroutines:
117# compile(pat): compile a pattern, caching already compiled patterns
118# expand(repl, regs, str): expand \digit escapes in replacement string
119
120
# Manage a cache of compiled regular expressions.
#
# Given a pattern string, return its compiled form.  Compiled patterns
# are remembered in the cache, keyed by the pattern text together with
# the regex syntax in effect at the time, so a pattern seen before
# under the same syntax is not compiled again.  Anything that is not a
# plain string is assumed to be an already compiled pattern and is
# returned as is.

cache = {}

def compile(pat):
    if type(pat) != type(''):
        return pat              # Assume it is a compiled regex
    key = (pat, regex.get_syntax())
    prog = cache.get(key)
    if prog is None:
        # First use of this pattern under this syntax: compile and
        # remember it.
        prog = cache[key] = regex.compile(pat)
    return prog
Guido van Rossum7a461e51992-09-20 21:41:09 +0000141
142
def clear_cache():
    # Forget every compiled pattern by rebinding the module-level
    # cache to a brand new, empty dictionary.
    global cache
    cache = {}
Barry Warsawb67a25c1997-02-18 18:52:55 +0000146
147
# Expand \digit in the replacement.
# Each occurrence of \digit is replaced by the substring of str
# indicated by regs[digit].  To include a literal \ in the
# replacement, double it; other \ escapes are left unchanged (i.e.
# the \ and the following character are both copied).  A single \ at
# the very end of repl is copied as is.
#
# repl -- the replacement template
# regs -- sequence of (start, end) index pairs, indexed by group number
# str  -- the string the index pairs refer to
#
# The result is assembled from a list of pieces joined once at the
# end, avoiding repeated copying of the partial result.

def expand(repl, regs, str):
    if '\\' not in repl:
        # Fast path: nothing to expand.
        return repl
    pieces = []
    i = 0
    n = len(repl)
    ord0 = ord('0')
    while i < n:
        c = repl[i]
        i = i + 1
        if c != '\\' or i >= n:
            # Ordinary character, or a lone backslash at the end.
            pieces.append(c)
            continue
        # c was a backslash; look at the character it escapes.
        c = repl[i]
        i = i + 1
        if '0' <= c <= '9':
            a, b = regs[ord(c) - ord0]
            pieces.append(str[a:b])
        elif c == '\\':
            pieces.append(c)
        else:
            pieces.append('\\' + c)
    return ''.join(pieces)
Guido van Rossum7a461e51992-09-20 21:41:09 +0000174
175
# Test program, reads sequences "pat repl str" from stdin.
# Optional argument specifies pattern used to split lines.
#
# For every input line holding exactly three fields, the results of
# sub() and gsub() are written to stdout; any other line produces a
# complaint together with the fields that were found.  A '--> ' prompt
# is written to stderr when stdin is a terminal.  The loop ends at end
# of file.

def test():
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    while 1:
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            sys.stdout.write('Sorry, not three fields\n')
            sys.stdout.write('split: %s\n' % repr(fields))
            continue
        # Reuse the fields computed above instead of splitting the
        # line a second time.
        pat, repl, text = fields
        sys.stdout.write('sub : %s\n' % repr(sub(pat, repl, text)))
        sys.stdout.write('gsub: %s\n' % repr(gsub(pat, repl, text)))
197 print 'gsub:', `gsub(pat, repl, str)`