blob: 6dbe8dd92cd6fb4e6b50f846030466f888296e2b [file] [log] [blame]
# Regular expression subroutines:
# sub(pat, repl, str): replace first occurrence of pattern in string
# gsub(pat, repl, str): replace all occurrences of pattern in string
# split(str, pat[, retain]): split string using pattern as delimiter
# capwords(str, pat): capitalize words split using a pattern
5
6
7import regex
8
9
# Replace first occurrence of pattern pat in string str by replacement
# repl.  If the pattern isn't found, the string is returned unchanged.
# The replacement may contain references \digit to subpatterns and
# escaped backslashes.  The pattern may be a string or an already
# compiled pattern.
15
def sub(pat, repl, str):
    """Replace the first match of pat in str by the expansion of repl.

    pat is either a pattern string or an already compiled pattern.
    repl may contain \\digit references to subpatterns and doubled
    backslashes.  When pat does not match, str is returned unchanged.
    """
    matcher = compile(pat)
    if matcher.search(str) < 0:
        # No match anywhere: hand the input back untouched.
        return str
    regs = matcher.regs
    match_start, match_end = regs[0]
    replacement = expand(repl, regs, str)
    return str[:match_start] + replacement + str[match_end:]
23
24
# Replace all (non-overlapping) occurrences of pattern pat in string
# str by replacement repl.  The same rules as for sub() apply.
# Empty matches for the pattern are replaced only when not adjacent to
# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
29
def gsub(pat, repl, str):
    """Replace every non-overlapping match of pat in str by repl.

    pat and repl follow the same rules as for sub().  An empty match
    is only replaced when it does not sit directly after the previous
    match, so gsub('', '-', 'abc') yields '-a-b-c-'.
    """
    matcher = compile(pat)
    pieces = []
    pos = 0             # where the next search begins / last match ended
    seen_match = 0      # false only until the first replacement is made
    while matcher.search(str, pos) >= 0:
        regs = matcher.regs
        lo, hi = regs[0]
        if seen_match and lo == hi == pos:
            # Empty match glued to the previous one: look again one
            # character further on, or stop if that is impossible.
            if pos >= len(str) or matcher.search(str, pos + 1) < 0:
                break
            regs = matcher.regs
            lo, hi = regs[0]
        pieces.append(str[pos:lo])
        pieces.append(expand(repl, regs, str))
        pos = hi
        seen_match = 1
    # Whatever follows the last match is copied verbatim.
    pieces.append(str[pos:])
    return ''.join(pieces)
48
49
# Split string str in fields separated by delimiters matching pattern
# pat.  Only non-empty matches for the pattern are considered, so e.g.
# split('abc', '') returns ['abc'].
# When the optional 3rd argument is true, the separators are also
# inserted into the list.
Guido van Rossum7a461e51992-09-20 21:41:09 +000055
def split(str, pat, retain = 0):
    """Split str into fields separated by non-empty matches of pat.

    Returns the list of fields.  Empty matches never act as
    delimiters, so split('abc', '') gives ['abc'].  When retain is
    true, each matched separator is added to the list right after the
    field that precedes it.
    """
    matcher = compile(pat)
    fields = []
    field_start = 0     # start of the field currently being collected
    scan_from = 0       # where the next search begins
    while matcher.search(str, scan_from) >= 0:
        regs = matcher.regs
        lo, hi = regs[0]
        if lo == hi:
            # Empty match: not a delimiter, just step one character on.
            scan_from = scan_from + 1
            if scan_from >= len(str):
                break
            continue
        fields.append(str[field_start:lo])
        if retain:
            fields.append(str[lo:hi])
        field_start = scan_from = hi
    # The text after the last delimiter is the final field.
    fields.append(str[field_start:])
    return fields
74
75
# Capitalize words split using a pattern
77
def capwords(str, pat):
    """Capitalize each word of str, where pat matches the separators.

    The separators themselves are kept unchanged in the result.
    """
    pieces = split(str, pat, 1)
    # With separators retained, words sit at the even indexes and the
    # separators at the odd ones.
    for idx in range(0, len(pieces), 2):
        pieces[idx] = pieces[idx].capitalize()
    return "".join(pieces)
84
85
# Internal subroutines:
# compile(pat): compile a pattern, caching already compiled patterns
# expand(repl, regs, str): expand \digit escapes in replacement string
89
90
# Manage a cache of compiled regular expressions.
# If the pattern is a string, a compiled version of it is returned.
# If the pattern has been used before, we return an already compiled
# version from the cache; otherwise we compile it now and save the
# compiled version in the cache.
# Instead of a string, a compiled regular expression can also be
# passed.
# WARNING: if the pattern syntax is changed, the cache should be
# flushed!
100
# Maps pattern strings to their compiled regex objects.
cache = {}

def compile(pat):
    """Return a compiled pattern for pat, consulting the cache.

    Anything that is not a plain string is assumed to be a compiled
    regular expression already and is returned as is.  A string is
    compiled with regex.compile() the first time it is seen and then
    served from the module-level cache.
    """
    if type(pat) != type(''):
        return pat              # Assume it is a compiled regex
    if pat in cache:
        return cache[pat]       # Seen before: reuse it
    compiled = regex.compile(pat)
    cache[pat] = compiled
    return compiled
111
112
# Expand \digit in the replacement.
# Each occurrence of \digit is replaced by the substring of str
# indicated by regs[digit].  To include a literal \ in the
# replacement, double it; other \ escapes are left unchanged (i.e.
# the \ and the following character are both copied).
118
def expand(repl, regs, str):
    """Expand the backslash escapes in the replacement string repl.

    \\digit is replaced by str[a:b], where (a, b) is regs[digit].
    A doubled backslash yields a single backslash.  Any other escape,
    as well as a lone backslash at the very end, is copied unchanged.
    A replacement without backslashes is returned as is.
    """
    if '\\' not in repl:
        return repl
    out = []
    pos = 0
    end = len(repl)
    while pos < end:
        ch = repl[pos]
        pos = pos + 1
        if ch != '\\' or pos >= end:
            # Ordinary character, or a backslash with nothing after it.
            out.append(ch)
            continue
        escaped = repl[pos]
        pos = pos + 1
        if '0' <= escaped <= '9':
            lo, hi = regs[int(escaped)]
            out.append(str[lo:hi])
        elif escaped == '\\':
            out.append(escaped)
        else:
            # Unknown escape: keep both the backslash and the character.
            out.append('\\' + escaped)
    return ''.join(out)
139
140
# Test program, reads sequences "pat repl str" from stdin.
# Optional argument specifies pattern used to split lines.
143
def test():
    """Interactive test driver: read "pat repl str" lines from stdin.

    Each input line is split into fields using the delimiter pattern
    given as the first command line argument (default: runs of blanks,
    tabs and newlines).  A line with exactly three fields is fed to
    sub() and gsub() and both results are printed; any other line is
    reported together with the fields it was split into.  Reading
    stops at end of file.
    """
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    write = sys.stdout.write
    while 1:
        # Only prompt when a person is typing the input.
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            write('Sorry, not three fields\n')
            write('split: ' + repr(fields) + '\n')
            continue
        # Reuse the fields computed above instead of splitting the
        # line a second time.
        pat, repl, text = fields
        write('sub : ' + repr(sub(pat, repl, text)) + '\n')
        write('gsub: ' + repr(gsub(pat, repl, text)) + '\n')