# This module compiles a regular expression that recognizes Python tokens.
# It is designed to match the working of the Python tokenizer exactly.
# It takes care of everything except indentation;
# note that un-escaped newlines are tokens, too.
# tokenprog.regs[3] gives the location of the token without whitespace.
# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.
| 8 | |
| 9 | import regex |
| 10 | |
# Note: to get a quoted backslash in a regexp, it must be quadrupled.
# (The Python string literal halves the backslashes once, and the regexp
# itself needs a doubled backslash to stand for one literal backslash.)
#
# The patterns below are written for the syntax selected by
# regex.set_syntax(0): '\(' and '\)' group, '\|' separates alternatives,
# and a bare '|', '(' or ')' is an ordinary character.

# Stuff that may precede a token and is skipped: blanks and tabs, any
# number of backslash-newline continuations (each followed by more blanks
# and tabs), and an optional '#' comment.
# This pattern contributes the first two groups of the compiled Token.
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

# Identifier: a letter or underscore, then letters, digits or underscores.
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Integer literals; each form takes an optional 'l' or 'L' suffix.
# As written, Hexnumber accepts '0x' with no hex digits after it, and
# Octnumber is also what matches a plain '0'.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber
# Floating point literals.
# Exponent: 'e' or 'E', an optional sign, and at least one digit.
Exponent = '[eE][-+]?[0-9]+'
# Pointfloat: digits '.' optional digits, or '.' digits; exponent optional.
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
# Expfloat: digits with no decimal point; the exponent is mandatory.
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\|' + Expfloat
# NOTE(review): the integer alternatives are listed before the float ones.
# Whether '1.5' is taken as one token therefore depends on how the regex
# module chooses between alternatives (longest vs. first) -- confirm.
Number = Intnumber + '\|' + Floatnumber

# String literal in single quotes: any run of backslash-plus-one-character
# escapes and characters other than backslash, newline and single quote.
# Only the single-quote delimiter is recognized here.
String = '\'\(\\\\.\|[^\\\n\']\)*\''

# Operators.  Two-character operators are listed before the one-character
# operators they start with ('==' before '=', '<=' before '<', ...).
# In '&\||\|<<' the middle '|' is the literal vertical-bar operator.
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
# Any one of ] [ ( ) { } (the leading ']' is a literal member of the set).
Bracket = '[][(){}]'
# Colon, semicolon, period, comma, backquote, or a newline character.
Special = '[:;.,`\n]'
Funny = Operator + '\|' + Bracket + '\|' + Special

# Any single token, without the ignorable stuff that may precede it.
PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny

# A token preceded by ignorable stuff.  The group opened here is the third
# one in the pattern (Ignore opens two), which is why the token's location
# is found in tokenprog.regs[3].
Token = Ignore + '\(' + PlainToken + '\)'
| 37 | |
# Compile Token into tokenprog using regex syntax 0, then put back whatever
# syntax was in effect before, so importing this module does not change
# the regex syntax seen by other code.
#
# The syntax switch is done *before* entering the try statement: if
# set_syntax() itself fails there is nothing to restore, and the finally
# clause must not run with save_syntax still unbound (that would raise a
# NameError and hide the real error).
save_syntax = regex.set_syntax(0) # Use default syntax
try:
    tokenprog = regex.compile(Token)
finally:
    # Runs whether or not compile() succeeded.
    dummy = regex.set_syntax(save_syntax) # Restore original syntax
| 43 | |
| 44 | |
| 45 | def test(file): |
| 46 | f = open(file, 'r') |
| 47 | while 1: |
| 48 | line = f.readline() |
| 49 | if not line: break |
| 50 | i, n = 0, len(line) |
| 51 | while i < n: |
| 52 | j = tokenprog.match(line, i) |
| 53 | if j < 0: |
| 54 | print 'No token at', `line[i:i+20]` + '...' |
| 55 | i = i+1 |
| 56 | else: |
| 57 | i = i+j |
| 58 | a, b = tokenprog.regs[3] |
| 59 | if a < b: |
| 60 | print 'Token:', `line[a:b]` |