| Andrew M. Kuchling | d9b38d2 | 2004-04-06 19:42:34 +0000 | [diff] [blame] | 1 | #! /usr/bin/env python | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 2 |  | 
| Guido van Rossum | 7f91cf9 | 2000-05-30 13:25:35 +0000 | [diff] [blame] | 3 | r"""Convert old ("regex") regular expressions to new syntax ("re"). | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 4 |  | 
 | 5 | When imported as a module, there are two functions, each with its own | 
 | 6 | docstring: | 
 | 7 |  | 
 | 8 |   convert(s, syntax=None) -- convert a regex regular expression to re syntax | 
 | 9 |  | 
 | 10 |   quote(s) -- return a quoted string literal | 
 | 11 |  | 
 | 12 | When used as a script, read a Python string literal (or any other | 
 | 13 | expression evaluating to a string) from stdin, and write the | 
 | 14 | translated expression to stdout as a string literal.  Unless stdout is | 
 | 15 | a tty, no trailing \n is written to stdout.  This is done so that it | 
 | 16 | can be used with Emacs C-U M-| (shell-command-on-region with argument | 
 | 17 | which filters the region through the shell command). | 
 | 18 |  | 
 | 19 | No attempt has been made at coding for performance. | 
 | 20 |  | 
 | 21 | Translation table... | 
 | 22 |  | 
 | 23 |     \(    (     (unless RE_NO_BK_PARENS set) | 
 | 24 |     \)    )     (unless RE_NO_BK_PARENS set) | 
 | 25 |     \|    |     (unless RE_NO_BK_VBAR set) | 
 | 26 |     \<    \b    (not quite the same, but close enough) | 
 | 27 |     \>    \b    (not quite the same, but close enough) | 
 | 28 |     \`    \A | 
 | 29 |     \'    \Z | 
 | 30 |  | 
 | 31 | Not translated... | 
 | 32 |  | 
 | 33 |     . | 
 | 34 |     ^ | 
 | 35 |     $ | 
 | 36 |     * | 
 | 37 |     +           (unless RE_BK_PLUS_QM set, then to \+) | 
 | 38 |     ?           (unless RE_BK_PLUS_QM set, then to \?) | 
 | 39 |     \ | 
 | 40 |     \b | 
 | 41 |     \B | 
 | 42 |     \w | 
 | 43 |     \W | 
 | 44 |     \1 ... \9 | 
 | 45 |  | 
 | 46 | Special cases... | 
 | 47 |  | 
 | 48 |     Non-printable characters are always replaced by their 3-digit | 
 | 49 |     escape code (except \t, \n, \r, which use mnemonic escapes) | 
 | 50 |  | 
 | 51 |     Newline is turned into | when RE_NEWLINE_OR is set | 
 | 52 |  | 
 | 53 | XXX To be done... | 
 | 54 |  | 
 | 55 |     [...]     (different treatment of backslashed items?) | 
 | 56 |     [^...]    (different treatment of backslashed items?) | 
 | 57 |     ^ $ * + ? (in some error contexts these are probably treated differently) | 
 | 58 |     \vDD  \DD (in the regex docs but only works when RE_ANSI_HEX set) | 
 | 59 |  | 
 | 60 | """ | 
 | 61 |  | 
 | 62 |  | 
| Guido van Rossum | 81fc778 | 2001-09-04 15:18:54 +0000 | [diff] [blame] | 63 | import warnings | 
 | 64 | warnings.filterwarnings("ignore", ".* regex .*", DeprecationWarning, __name__, | 
 | 65 |                         append=1) | 
 | 66 |  | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 67 | import regex | 
| Tim Peters | 0c9886d | 2001-01-15 01:18:21 +0000 | [diff] [blame] | 68 | from regex_syntax import * # RE_* | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 69 |  | 
| Skip Montanaro | 0de6580 | 2001-02-15 22:15:14 +0000 | [diff] [blame] | 70 | __all__ = ["convert","quote"] | 
 | 71 |  | 
# Default translation table: maps a token of the old "regex" syntax to the
# text emitted for the new "re" syntax.  convert() works on a copy of this
# table and adjusts it according to the active syntax flags.
mastertable = {
    # Word-boundary and buffer-anchor escapes.
    "\\<": "\\b",
    "\\>": "\\b",
    "\\`": "\\A",
    "\\'": "\\Z",
    # Backslashed operators lose their backslash...
    "\\(": "(",
    "\\)": ")",
    "\\|": "|",
    # ...and the bare characters gain one.
    "(": "\\(",
    ")": "\\)",
    "|": "\\|",
    # Whitespace control characters are written as mnemonic escapes.
    "\t": "\\t",
    "\n": "\\n",
    "\r": "\\r",
}
 | 88 |  | 
 | 89 |  | 
 | 90 | def convert(s, syntax=None): | 
 | 91 |     """Convert a regex regular expression to re syntax. | 
 | 92 |  | 
 | 93 |     The first argument is the regular expression, as a string object, | 
 | 94 |     just like it would be passed to regex.compile().  (I.e., pass the | 
 | 95 |     actual string object -- string quotes must already have been | 
 | 96 |     removed and the standard escape processing has already been done, | 
 | 97 |     e.g. by eval().) | 
 | 98 |  | 
 | 99 |     The optional second argument is the regex syntax variant to be | 
 | 100 |     used.  This is an integer mask as passed to regex.set_syntax(); | 
 | 101 |     the flag bits are defined in regex_syntax.  When not specified, or | 
 | 102 |     when None is given, the current regex syntax mask (as retrieved by | 
 | 103 |     regex.get_syntax()) is used -- which is 0 by default. | 
 | 104 |  | 
 | 105 |     The return value is a regular expression, as a string object that | 
 | 106 |     could be passed to re.compile().  (I.e., no string quotes have | 
 | 107 |     been added -- use quote() below, or repr().) | 
 | 108 |  | 
 | 109 |     The conversion is not always guaranteed to be correct.  More | 
 | 110 |     syntactical analysis should be performed to detect borderline | 
 | 111 |     cases and decide what to do with them.  For example, 'x*?' is not | 
 | 112 |     translated correctly. | 
 | 113 |  | 
 | 114 |     """ | 
 | 115 |     table = mastertable.copy() | 
 | 116 |     if syntax is None: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 117 |         syntax = regex.get_syntax() | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 118 |     if syntax & RE_NO_BK_PARENS: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 119 |         del table[r'\('], table[r'\)'] | 
 | 120 |         del table['('], table[')'] | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 121 |     if syntax & RE_NO_BK_VBAR: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 122 |         del table[r'\|'] | 
 | 123 |         del table['|'] | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 124 |     if syntax & RE_BK_PLUS_QM: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 125 |         table['+'] = r'\+' | 
 | 126 |         table['?'] = r'\?' | 
 | 127 |         table[r'\+'] = '+' | 
 | 128 |         table[r'\?'] = '?' | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 129 |     if syntax & RE_NEWLINE_OR: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 130 |         table['\n'] = '|' | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 131 |     res = "" | 
 | 132 |  | 
 | 133 |     i = 0 | 
 | 134 |     end = len(s) | 
 | 135 |     while i < end: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 136 |         c = s[i] | 
 | 137 |         i = i+1 | 
 | 138 |         if c == '\\': | 
 | 139 |             c = s[i] | 
 | 140 |             i = i+1 | 
 | 141 |             key = '\\' + c | 
 | 142 |             key = table.get(key, key) | 
 | 143 |             res = res + key | 
 | 144 |         else: | 
 | 145 |             c = table.get(c, c) | 
 | 146 |             res = res + c | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 147 |     return res | 
 | 148 |  | 
 | 149 |  | 
 | 150 | def quote(s, quote=None): | 
 | 151 |     """Convert a string object to a quoted string literal. | 
 | 152 |  | 
 | 153 |     This is similar to repr() but will return a "raw" string (r'...' | 
 | 154 |     or r"...") when the string contains backslashes, instead of | 
 | 155 |     doubling all backslashes.  The resulting string does *not* always | 
 | 156 |     evaluate to the same string as the original; however it will do | 
 | 157 |     just the right thing when passed into re.compile(). | 
 | 158 |  | 
 | 159 |     The optional second argument forces the string quote; it must be | 
 | 160 |     a single character which is a valid Python string quote. | 
 | 161 |  | 
 | 162 |     """ | 
 | 163 |     if quote is None: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 164 |         q = "'" | 
 | 165 |         altq = "'" | 
 | 166 |         if q in s and altq not in s: | 
 | 167 |             q = altq | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 168 |     else: | 
| Skip Montanaro | 891a1ba | 2005-01-16 19:31:40 +0000 | [diff] [blame] | 169 |         assert quote in ('"', "'", '"""', "'''") | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 170 |         q = quote | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 171 |     res = q | 
 | 172 |     for c in s: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 173 |         if c == q: c = '\\' + c | 
 | 174 |         elif c < ' ' or c > '~': c = "\\%03o" % ord(c) | 
 | 175 |         res = res + c | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 176 |     res = res + q | 
 | 177 |     if '\\' in res: | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 178 |         res = 'r' + res | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 179 |     return res | 
 | 180 |  | 
 | 181 |  | 
 | 182 | def main(): | 
 | 183 |     """Main program -- called when run as a script.""" | 
 | 184 |     import sys | 
 | 185 |     s = eval(sys.stdin.read()) | 
 | 186 |     sys.stdout.write(quote(convert(s))) | 
 | 187 |     if sys.stdout.isatty(): | 
| Guido van Rossum | 45e2fbc | 1998-03-26 21:13:24 +0000 | [diff] [blame] | 188 |         sys.stdout.write("\n") | 
| Guido van Rossum | f81e5b9 | 1997-10-23 22:43:50 +0000 | [diff] [blame] | 189 |  | 
 | 190 |  | 
 | 191 | if __name__ == '__main__': | 
 | 192 |     main() |