| Guido van Rossum | e7b146f | 2000-02-04 15:28:42 +0000 | [diff] [blame] | 1 | """Regexp-based split and replace using the obsolete regex module. | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 2 |  | 
| Guido van Rossum | e7b146f | 2000-02-04 15:28:42 +0000 | [diff] [blame] | 3 | This module is only for backward compatibility.  These operations | 
 | 4 | are now provided by the new regular expression module, "re". | 
 | 5 |  | 
 | 6 | sub(pat, repl, str):        replace first occurrence of pattern in string | 
 | 7 | gsub(pat, repl, str):       replace all occurrences of pattern in string | 
 | 8 | split(str, pat, maxsplit):  split string using pattern as delimiter | 
 | 9 | splitx(str, pat, maxsplit): split string using pattern as delimiter plus | 
 | 10 |                             return delimiters | 
 | 11 | """ | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 12 |  | 
| Guido van Rossum | 7292e92 | 2000-12-19 18:25:58 +0000 | [diff] [blame] | 13 | import warnings | 
 | 14 | warnings.warn("the regsub module is deprecated; please use re.sub()", | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 15 |               DeprecationWarning) | 
| Guido van Rossum | 7292e92 | 2000-12-19 18:25:58 +0000 | [diff] [blame] | 16 |  | 
 | 17 | # Ignore further deprecation warnings about this module | 
 | 18 | warnings.filterwarnings("ignore", "", DeprecationWarning, __name__) | 
 | 19 |  | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 20 | import regex | 
 | 21 |  | 
| Skip Montanaro | 0de6580 | 2001-02-15 22:15:14 +0000 | [diff] [blame] | 22 | __all__ = ["sub","gsub","split","splitx","capwords"] | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 23 |  | 
 | 24 | # Replace first occurrence of pattern pat in string str by replacement | 
 | 25 | # repl.  If the pattern isn't found, the string is returned unchanged. | 
 | 26 | # The replacement may contain references \digit to subpatterns and | 
 | 27 | # escaped backslashes.  The pattern may be a string or an already | 
 | 28 | # compiled pattern. | 
 | 29 |  | 
 | 30 | def sub(pat, repl, str): | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 31 |     prog = compile(pat) | 
 | 32 |     if prog.search(str) >= 0: | 
 | 33 |         regs = prog.regs | 
 | 34 |         a, b = regs[0] | 
 | 35 |         str = str[:a] + expand(repl, regs, str) + str[b:] | 
 | 36 |     return str | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 37 |  | 
 | 38 |  | 
 | 39 | # Replace all (non-overlapping) occurrences of pattern pat in string | 
 | 40 | # str by replacement repl.  The same rules as for sub() apply. | 
 | 41 | # Empty matches for the pattern are replaced only when not adjacent to | 
 | 42 | # a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'. | 
 | 43 |  | 
 | 44 | def gsub(pat, repl, str): | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 45 |     prog = compile(pat) | 
 | 46 |     new = '' | 
 | 47 |     start = 0 | 
 | 48 |     first = 1 | 
 | 49 |     while prog.search(str, start) >= 0: | 
 | 50 |         regs = prog.regs | 
 | 51 |         a, b = regs[0] | 
 | 52 |         if a == b == start and not first: | 
 | 53 |             if start >= len(str) or prog.search(str, start+1) < 0: | 
 | 54 |                 break | 
 | 55 |             regs = prog.regs | 
 | 56 |             a, b = regs[0] | 
 | 57 |         new = new + str[start:a] + expand(repl, regs, str) | 
 | 58 |         start = b | 
 | 59 |         first = 0 | 
 | 60 |     new = new + str[start:] | 
 | 61 |     return new | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 62 |  | 
 | 63 |  | 
 | 64 | # Split string str in fields separated by delimiters matching pattern | 
 | 65 | # pat.  Only non-empty matches for the pattern are considered, so e.g. | 
 | 66 | # split('abc', '') returns ['abc']. | 
| Guido van Rossum | a59d3e6 | 1996-08-08 18:39:18 +0000 | [diff] [blame] | 67 | # The optional 3rd argument sets the number of splits that are performed. | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 68 |  | 
def split(str, pat, maxsplit = 0):
    """Split str into fields separated by non-empty matches of pat.

    The delimiters themselves are dropped.  maxsplit limits the number
    of splits performed; the default 0 means no limit.
    """
    return intsplit(str, pat, maxsplit, retain=0)
| Guido van Rossum | a59d3e6 | 1996-08-08 18:39:18 +0000 | [diff] [blame] | 71 |  | 
 | 72 | # Split string str in fields separated by delimiters matching pattern | 
 | 73 | # pat.  Only non-empty matches for the pattern are considered, so e.g. | 
 | 74 | # splitx('abc', '') returns ['abc']. The delimiters are also included | 
 | 75 | # in the list. | 
 | 76 | # The optional 3rd argument sets the number of splits that are performed. | 
 | 77 |  | 
 | 78 |  | 
 | 79 | def splitx(str, pat, maxsplit = 0): | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 80 |     return intsplit(str, pat, maxsplit, 1) | 
 | 81 |  | 
| Guido van Rossum | a59d3e6 | 1996-08-08 18:39:18 +0000 | [diff] [blame] | 82 | # Internal function used to implement split() and splitx(). | 
 | 83 |  | 
 | 84 | def intsplit(str, pat, maxsplit, retain): | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 85 |     prog = compile(pat) | 
 | 86 |     res = [] | 
 | 87 |     start = next = 0 | 
 | 88 |     splitcount = 0 | 
 | 89 |     while prog.search(str, next) >= 0: | 
 | 90 |         regs = prog.regs | 
 | 91 |         a, b = regs[0] | 
 | 92 |         if a == b: | 
 | 93 |             next = next + 1 | 
 | 94 |             if next >= len(str): | 
 | 95 |                 break | 
 | 96 |         else: | 
 | 97 |             res.append(str[start:a]) | 
 | 98 |             if retain: | 
 | 99 |                 res.append(str[a:b]) | 
 | 100 |             start = next = b | 
 | 101 |             splitcount = splitcount + 1 | 
 | 102 |             if (maxsplit and (splitcount >= maxsplit)): | 
 | 103 |                 break | 
 | 104 |     res.append(str[start:]) | 
 | 105 |     return res | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 106 |  | 
 | 107 |  | 
| Guido van Rossum | 4cc4ab1 | 1996-06-11 18:45:15 +0000 | [diff] [blame] | 108 | # Capitalize words split using a pattern | 
 | 109 |  | 
def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Capitalize each word of str, where words are delimited by pat.

    The string is split with splitx(), so even positions hold the words
    and odd positions hold the delimiters, which are copied unchanged.
    """
    rebuilt = []
    for index, piece in enumerate(splitx(str, pat)):
        if index % 2 == 0:
            piece = piece.capitalize()
        rebuilt.append(piece)
    return "".join(rebuilt)
| Guido van Rossum | 4cc4ab1 | 1996-06-11 18:45:15 +0000 | [diff] [blame] | 115 |  | 
 | 116 |  | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 117 | # Internal subroutines: | 
 | 118 | # compile(pat): compile a pattern, caching already compiled patterns | 
 | 119 | # expand(repl, regs, str): expand \digit escapes in replacement string | 
 | 120 |  | 
 | 121 |  | 
 | 122 | # Manage a cache of compiled regular expressions. | 
| Barry Warsaw | b67a25c | 1997-02-18 18:52:55 +0000 | [diff] [blame] | 123 | # | 
 | 124 | # If the pattern is a string a compiled version of it is returned.  If | 
 | 125 | # the pattern has been used before we return an already compiled | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 126 | # version from the cache; otherwise we compile it now and save the | 
| Barry Warsaw | b67a25c | 1997-02-18 18:52:55 +0000 | [diff] [blame] | 127 | # compiled version in the cache, along with the syntax it was compiled | 
 | 128 | # with.  Instead of a string, a compiled regular expression can also | 
 | 129 | # be passed. | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 130 |  | 
 | 131 | cache = {} | 
 | 132 |  | 
 | 133 | def compile(pat): | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 134 |     if type(pat) != type(''): | 
 | 135 |         return pat              # Assume it is a compiled regex | 
 | 136 |     key = (pat, regex.get_syntax()) | 
| Raymond Hettinger | 54f0222 | 2002-06-01 14:18:47 +0000 | [diff] [blame] | 137 |     if key in cache: | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 138 |         prog = cache[key]       # Get it from the cache | 
 | 139 |     else: | 
 | 140 |         prog = cache[key] = regex.compile(pat) | 
 | 141 |     return prog | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 142 |  | 
 | 143 |  | 
def clear_cache():
    """Discard all cached compiled patterns.

    The module-level name is rebound to a fresh dictionary; the old
    dictionary object itself is not modified.
    """
    global cache
    cache = dict()
| Barry Warsaw | b67a25c | 1997-02-18 18:52:55 +0000 | [diff] [blame] | 147 |  | 
 | 148 |  | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 149 | # Expand \digit in the replacement. | 
 | 150 | # Each occurrence of \digit is replaced by the substring of str | 
 | 151 | # indicated by regs[digit].  To include a literal \ in the | 
 | 152 | # replacement, double it; other \ escapes are left unchanged (i.e. | 
 | 153 | # the \ and the following character are both copied). | 
 | 154 |  | 
 | 155 | def expand(repl, regs, str): | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 156 |     if '\\' not in repl: | 
 | 157 |         return repl | 
 | 158 |     new = '' | 
 | 159 |     i = 0 | 
 | 160 |     ord0 = ord('0') | 
 | 161 |     while i < len(repl): | 
 | 162 |         c = repl[i]; i = i+1 | 
 | 163 |         if c != '\\' or i >= len(repl): | 
 | 164 |             new = new + c | 
 | 165 |         else: | 
 | 166 |             c = repl[i]; i = i+1 | 
 | 167 |             if '0' <= c <= '9': | 
 | 168 |                 a, b = regs[ord(c)-ord0] | 
 | 169 |                 new = new + str[a:b] | 
 | 170 |             elif c == '\\': | 
 | 171 |                 new = new + c | 
 | 172 |             else: | 
 | 173 |                 new = new + '\\' + c | 
 | 174 |     return new | 
| Guido van Rossum | 7a461e5 | 1992-09-20 21:41:09 +0000 | [diff] [blame] | 175 |  | 
 | 176 |  | 
 | 177 | # Test program, reads sequences "pat repl str" from stdin. | 
 | 178 | # Optional argument specifies pattern used to split lines. | 
 | 179 |  | 
 | 180 | def test(): | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 181 |     import sys | 
 | 182 |     if sys.argv[1:]: | 
 | 183 |         delpat = sys.argv[1] | 
 | 184 |     else: | 
 | 185 |         delpat = '[ \t\n]+' | 
 | 186 |     while 1: | 
 | 187 |         if sys.stdin.isatty(): sys.stderr.write('--> ') | 
 | 188 |         line = sys.stdin.readline() | 
 | 189 |         if not line: break | 
 | 190 |         if line[-1] == '\n': line = line[:-1] | 
 | 191 |         fields = split(line, delpat) | 
 | 192 |         if len(fields) != 3: | 
 | 193 |             print 'Sorry, not three fields' | 
 | 194 |             print 'split:', `fields` | 
 | 195 |             continue | 
 | 196 |         [pat, repl, str] = split(line, delpat) | 
 | 197 |         print 'sub :', `sub(pat, repl, str)` | 
 | 198 |         print 'gsub:', `gsub(pat, repl, str)` |