| # This module compiles a regular expression that recognizes Python tokens. |
| # It is designed to match the working of the Python tokenizer exactly. |
| # It takes care of everything except indentation; |
| # note that un-escaped newlines are tokens, too. |
| # After a successful match, tokenprog.regs[3] gives the (start, end) |
| # location of the token itself, without the whitespace skipped before it. |
| # It also defines various subexpressions, but doesn't compile them. |
| # See the function test() below for an example of how to use it. |
| |
| import regex |
| |
# Reminder: a backslash that the regexp itself must see as a literal
# backslash has to be written four times in the Python source.
# Alternatives are joined with the two-character sequence \| throughout.

# Text skipped in front of a token: blanks and tabs, backslash-newline
# continuations, and an optional comment.
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

# Identifiers.
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Integer literals: hexadecimal, octal and decimal, each with an
# optional l/L suffix.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = '%s\|%s\|%s' % (Hexnumber, Octnumber, Decnumber)

# Floating point literals: either containing a point (exponent optional)
# or digits followed by a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(%s\)?' % Exponent
Expfloat = '[0-9]+%s' % Exponent
Floatnumber = '%s\|%s' % (Pointfloat, Expfloat)
Number = '%s\|%s' % (Floatnumber, Intnumber)

# String literals in single or double quotes; a backslash escapes the
# character after it, and a bare newline is not allowed inside.
# Note: this module *recognizes* double quotes, but for backward
# compatibility, it doesn't *use* them!
String = '%s\|%s' % ('\'\(\\\\.\|[^\\\n\']\)*\'', '"\(\\\\.\|[^\\\n"]\)*"')

# Operators, brackets and the remaining single-character tokens
# (the newline is one of them).
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
Bracket = '[][(){}]'
Special = '[:;.,`\n]'
Funny = '%s\|%s\|%s' % (Operator, Bracket, Special)

PlainToken = '%s\|%s\|%s\|%s' % (Name, Number, String, Funny)

# Ignore contributes groups 1 and 2, so the token proper is group 3.
Token = '%s\(%s\)' % (Ignore, PlainToken)
| |
# Compile the complete token pattern under the default regex syntax.
# The syntax switch is done *before* entering the try block: if
# set_syntax() itself fails there is nothing to restore, and the finally
# clause must not run with save_syntax still unbound (that would raise a
# NameError and hide the real error).
save_syntax = regex.set_syntax(0) # Use default syntax
try:
    tokenprog = regex.compile(Token)
finally:
    # Put the caller's syntax back even when compilation fails; nothing
    # to do if it already was the default (0).
    if save_syntax != 0:
        dummy = regex.set_syntax(save_syntax) # Restore original syntax
| |
| |
# Example driver: tokenize the named file line by line and print each
# token found, or a diagnostic where no token can be matched.
#
# file -- name of the file to read (opened in text mode for reading).
#
# Returns nothing; all results go to standard output.  Errors from
# open() or from reading propagate to the caller.
def test(file):
    f = open(file, 'r')
    try:
        while 1:
            line = f.readline()
            if not line:
                break
            i, n = 0, len(line)
            while i < n:
                # A negative result means no token matches at offset i;
                # otherwise the result is added to i to move past the match.
                j = tokenprog.match(line, i)
                if j < 0:
                    # A single parenthesized expression prints the same text
                    # as the original "print a, b" form.
                    print('No token at ' + repr(line[i:i+20]) + '...')
                    i = i+1
                else:
                    i = i+j
                    # Group 3 is the token without the ignored text before it.
                    a, b = tokenprog.regs[3]
                    if a < b:
                        print('Token: ' + repr(line[a:b]))
    finally:
        # Always release the file, also when matching or printing fails.
        f.close()