blob: c87ac262aa2d1deffe289523b931eda8d1eed2eb [file] [log] [blame]
# Regular expression subroutines:
# sub(pat, repl, str): replace first occurrence of pattern in string
# gsub(pat, repl, str): replace all occurrences of pattern in string
# split(str, pat, maxsplit): split string using pattern as delimiter
# splitx(str, pat, maxsplit): like split(), but the delimiters are also
#                             returned in the result list
# capwords(str, pat): capitalize the words of a string, using pattern as
#                     the word delimiter
Guido van Rossum7a461e51992-09-20 21:41:09 +00007
8
9import regex
10
11
12# Replace first occurrence of pattern pat in string str by replacement
13# repl. If the pattern isn't found, the string is returned unchanged.
14# The replacement may contain references \digit to subpatterns and
15# escaped backslashes. The pattern may be a string or an already
16# compiled pattern.
17
def sub(pat, repl, str):
    """Return str with the first match of pat replaced by repl.

    pat may be a pattern string or an already compiled pattern (it is
    passed through compile()).  repl may contain \\digit references to
    subpatterns and doubled backslashes; it is expanded with expand().
    If the pattern is not found, str is returned unchanged.
    """
    matcher = compile(pat)
    # A negative search result means there is nothing to replace.
    if matcher.search(str) < 0:
        return str
    groups = matcher.regs
    begin, end = groups[0]      # span of the whole match
    return str[:begin] + expand(repl, groups, str) + str[end:]
25
26
27# Replace all (non-overlapping) occurrences of pattern pat in string
28# str by replacement repl. The same rules as for sub() apply.
29# Empty matches for the pattern are replaced only when not adjacent to
30# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
31
def gsub(pat, repl, str):
    """Return str with every non-overlapping match of pat replaced by repl.

    The same rules as for sub() apply to pat and repl.  An empty match is
    replaced only when it is not adjacent to the previous match, so e.g.
    gsub('', '-', 'abc') returns '-a-b-c-'.
    """
    matcher = compile(pat)
    # Collect the output pieces and join them once at the end instead of
    # growing a string by repeated concatenation inside the loop.
    pieces = []
    pos = 0             # where the unprocessed part of str begins
    seen_match = 0      # becomes true once the first replacement is done
    while matcher.search(str, pos) >= 0:
        groups = matcher.regs
        begin, end = groups[0]
        if seen_match and begin == end == pos:
            # Empty match directly after the previous match: skip it and
            # look again one character further on, if there is one.
            if pos >= len(str):
                break
            if matcher.search(str, pos + 1) < 0:
                break
            groups = matcher.regs
            begin, end = groups[0]
        pieces.append(str[pos:begin])
        pieces.append(expand(repl, groups, str))
        pos = end
        seen_match = 1
    pieces.append(str[pos:])
    return ''.join(pieces)
50
51
# Split string str into fields separated by delimiters matching pattern
# pat.  Only non-empty matches for the pattern are considered, so e.g.
# split('abc', '') returns ['abc'].
# The optional 3rd argument, maxsplit, limits the number of splits that
# are performed; 0 (the default) means no limit.
Guido van Rossum7a461e51992-09-20 21:41:09 +000056
def split(str, pat, maxsplit=0):
    """Split str into a list of fields, using matches of pat as delimiters.

    The delimiters themselves are not included in the result.  maxsplit
    is handed to intsplit() unchanged; the default is 0.
    """
    keep_delimiters = 0
    return intsplit(str, pat, maxsplit, keep_delimiters)
59
# Split string str into fields separated by delimiters matching pattern
# pat.  Only non-empty matches for the pattern are considered, so e.g.
# splitx('abc', '') returns ['abc'].  The delimiters are also included
# in the list.
# The optional 3rd argument, maxsplit, limits the number of splits that
# are performed; 0 (the default) means no limit.
65
66
def splitx(str, pat, maxsplit=0):
    """Split str like split(), but keep the delimiters in the result.

    The result list alternates between fields and the delimiter text
    that followed them.  maxsplit is handed to intsplit() unchanged; the
    default is 0.
    """
    keep_delimiters = 1
    return intsplit(str, pat, maxsplit, keep_delimiters)
69
70# Internal function used to implement split() and splitx().
71
def intsplit(str, pat, maxsplit, retain):
    """Internal function used to implement split() and splitx().

    Returns the list of fields of str delimited by non-empty matches of
    pat.  When retain is true, the text of each delimiter is appended to
    the list right after the field it terminates.  When maxsplit is
    non-zero, splitting stops after that many delimiters and the rest of
    str becomes the last field.
    """
    matcher = compile(pat)
    fields = []
    field_start = 0     # where the field being collected begins
    scan_from = 0       # where the next search starts
    nsplits = 0
    while matcher.search(str, scan_from) >= 0:
        begin, end = matcher.regs[0]
        if begin != end:
            fields.append(str[field_start:begin])
            if retain:
                fields.append(str[begin:end])
            field_start = scan_from = end
            nsplits = nsplits + 1
            if maxsplit and nsplits >= maxsplit:
                break
        else:
            # An empty match is never a delimiter; advance the search
            # position by one character and try again.
            scan_from = scan_from + 1
            if scan_from >= len(str):
                break
    # Whatever follows the last delimiter is the final field.
    fields.append(str[field_start:])
    return fields
94
95
# Capitalize the words of a string.  Words are the fields between
# delimiters matching a pattern; the delimiters are kept unchanged.
97
def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Return str with each word capitalized.

    str is split with splitx(), so the result list alternates between
    words (even indices) and the delimiters matching pat (odd indices).
    Only the words are capitalized; the delimiters are copied through
    unchanged and everything is joined back together.
    """
    words = splitx(str, pat)
    # String methods replace the deprecated string.capitalize() and
    # string.joinfields() module functions.
    for i in range(0, len(words), 2):
        words[i] = words[i].capitalize()
    return "".join(words)
104
105
Guido van Rossum7a461e51992-09-20 21:41:09 +0000106# Internal subroutines:
107# compile(pat): compile a pattern, caching already compiled patterns
108# expand(repl, regs, str): expand \digit escapes in replacement string
109
110
111# Manage a cache of compiled regular expressions.
112# If the pattern is a string a compiled version of it is returned.
113# If the pattern has been used before we return an already compiled
114# version from the cache; otherwise we compile it now and save the
115# compiled version in the cache.
116# Instead of a string, a compiled regular expression can also be
117# passed.
118# WARNING: if the pattern syntax is changed, the cache should be
119# flushed!
120
# Compiled patterns seen so far, keyed by their pattern string.
cache = {}

def compile(pat):
    """Return a compiled version of pat, caching compiled patterns.

    If pat is a string, the compiled pattern is taken from the cache when
    it has been used before; otherwise it is compiled with
    regex.compile() and saved in the cache.  Anything that is not a
    string is assumed to be an already compiled pattern and is returned
    unchanged.

    WARNING: if the pattern syntax is changed, the cache should be
    flushed!
    """
    # isinstance() also accepts subclasses of str, which an exact type
    # comparison would wrongly treat as compiled patterns.
    if not isinstance(pat, str):
        return pat
    try:
        return cache[pat]
    except KeyError:
        prog = cache[pat] = regex.compile(pat)
        return prog
131
132
133# Expand \digit in the replacement.
134# Each occurrence of \digit is replaced by the substring of str
135# indicated by regs[digit]. To include a literal \ in the
136# replacement, double it; other \ escapes are left unchanged (i.e.
137# the \ and the following character are both copied).
138
def expand(repl, regs, str):
    """Expand \\digit escapes in the replacement string repl.

    Each occurrence of \\digit is replaced by the substring of str
    indicated by regs[digit], so regs must have an entry (a pair of
    slice indices) for every digit that is referenced.  A doubled
    backslash yields a single literal backslash.  Any other backslash
    escape is left unchanged, i.e. the backslash and the character
    following it are both copied, and a lone backslash at the very end
    of repl is copied as is.
    """
    if '\\' not in repl:
        return repl             # nothing to expand
    # Collect the output pieces and join them once at the end instead of
    # growing a string by repeated concatenation inside the loop.
    pieces = []
    i = 0
    n = len(repl)
    while i < n:
        c = repl[i]
        i = i + 1
        if c != '\\' or i >= n:
            # Ordinary character, or a backslash with nothing after it.
            pieces.append(c)
            continue
        c = repl[i]             # the character following the backslash
        i = i + 1
        if '0' <= c <= '9':
            a, b = regs[int(c)]
            pieces.append(str[a:b])
        elif c == '\\':
            pieces.append(c)
        else:
            pieces.append('\\' + c)
    return ''.join(pieces)
159
160
161# Test program, reads sequences "pat repl str" from stdin.
162# Optional argument specifies pattern used to split lines.
163
def test():
    """Test program, reads sequences "pat repl str" from stdin.

    Each input line is split into fields with split(); the delimiter
    pattern is the first command line argument if one is given, and
    runs of blanks, tabs and newlines otherwise.  For every line with
    exactly three fields the results of sub() and gsub() are printed;
    other lines are reported and skipped.  A prompt is written to stderr
    when stdin is a terminal.  The loop ends at end of file.
    """
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    while 1:
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            print('Sorry, not three fields')
            print('%s %s' % ('split:', repr(fields)))
            continue
        # Reuse the fields already computed instead of splitting the line
        # a second time.
        pat, repl, text = fields
        print('%s %s' % ('sub :', repr(sub(pat, repl, text))))
        print('%s %s' % ('gsub:', repr(gsub(pat, repl, text))))