blob: 6b3d991a960c570ef278ebbb89a72cbaaca44275 [file] [log] [blame]
Guido van Rossum4d8e8591992-01-01 19:34:47 +00001# This module compiles a regular expression that recognizes Python tokens.
2# It is designed to match the working of the Python tokenizer exactly.
3# It takes care of everything except indentation;
4# note that un-escaped newlines are tokens, too.
# tokenprog.regs[3] gives the (start, end) location of the token itself,
# excluding any leading whitespace, continuation lines or comment.
6# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.
8
9import regex
10
11# Note: to get a quoted backslash in a regexp, it must be quadrupled.
12
# Text skipped before a token: optional blanks/tabs, any number of
# backslash-newline continuations (each followed by optional blanks/tabs),
# and an optional comment running to the end of the line.
# The two \( ... \) groups here are groups 1 and 2 of the final Token
# expression, which is why the token proper ends up in group 3.
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

# Identifiers: a letter or underscore followed by letters, digits, underscores.
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Integer literals, each with an optional long suffix (l or L).
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber

# Floating point literals: either digits with a decimal point and an
# optional exponent, or digits with a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\|' + Expfloat

# Floatnumber must be tried before Intnumber.  Assuming the matcher accepts
# the first alternative that succeeds rather than the longest one (TODO:
# confirm for the regex implementation in use), listing Intnumber first
# would stop '1.5' after '1' and '0.5' after '0', splitting one float into
# several tokens.  No float alternative can match a pure integer or a hex
# literal (each needs a '.' or an exponent right after the digits), so
# integers still fall through to Intnumber.  Under longest-match semantics
# the order makes no difference, so this ordering is safe either way.
Number = Floatnumber + '\|' + Intnumber

# Single-quoted string: between the quotes, either a backslash followed by
# any character, or any character other than backslash, newline and quote.
String = '\'\(\\\\.\|[^\\\n\']\)*\''

# Operators; two-character operators are listed before the one-character
# operators that are their prefixes (== before =, << and <= before <, ...).
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
Bracket = '[][(){}]'
# Remaining one-character tokens, including the un-escaped newline.
Special = '[:;.,`\n]'
Funny = Operator + '\|' + Bracket + '\|' + Special

PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny

# A complete token: ignorable prefix followed by the token proper, which is
# wrapped in its own group so that its location can be read from regs[3].
Token = Ignore + '\(' + PlainToken + '\)'
37
# Compile Token with the regex module switched to its default syntax, then
# put back the syntax setting that was in effect before.
# set_syntax() is called before entering the try block: if it were inside
# and failed, the finally clause would run with save_syntax unbound and
# raise a NameError that hides the real error.
save_syntax = regex.set_syntax(0) # Use default syntax
try:
    tokenprog = regex.compile(Token)
finally:
    # The result is bound to a throwaway name, as in the original code.
    dummy = regex.set_syntax(save_syntax) # Restore original syntax
43
44
45def test(file):
46 f = open(file, 'r')
47 while 1:
48 line = f.readline()
49 if not line: break
50 i, n = 0, len(line)
51 while i < n:
52 j = tokenprog.match(line, i)
53 if j < 0:
54 print 'No token at', `line[i:i+20]` + '...'
55 i = i+1
56 else:
57 i = i+j
58 a, b = tokenprog.regs[3]
59 if a < b:
60 print 'Token:', `line[a:b]`