Just van Rossum | 40f9b7b | 1999-01-30 22:39:17 +0000 | [diff] [blame^] | 1 | """Module to analyze Python source code; for syntax coloring tools. |
| 2 | |
| 3 | Interface: |
| 4 | tags = fontify(pytext, searchfrom, searchto) |
| 5 | |
| 6 | The 'pytext' argument is a string containing Python source code. |
| 7 | The (optional) arguments 'searchfrom' and 'searchto' are indices into pytext that delimit the region to scan; by default the whole text is scanned. |
| 8 | The returned value is a list of tuples, formatted like this: |
| 9 | [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ] |
| 10 | The tuple contents are always like this: |
| 11 | (tag, startindex, endindex, sublist) |
| 12 | tag is one of 'keyword', 'string', 'comment' or 'identifier' |
| 13 | sublist is not used, hence always None. |
| 14 | """ |
| 15 | |
| 16 | # Based on FontText.py by Mitchell S. Chapman, |
| 17 | # which was modified by Zachary Roadhouse, |
| 18 | # then un-Tk'd by Just van Rossum. |
| 19 | # Many thanks for regular expression debugging & authoring are due to: |
| 20 | # Tim (the-incredib-ly y'rs) Peters and Christian Tismer |
| 21 | # So, who owns the copyright? ;-) How about this: |
| 22 | # Copyright 1996-1997: |
| 23 | # Mitchell S. Chapman, |
| 24 | # Zachary Roadhouse, |
| 25 | # Tim Peters, |
| 26 | # Just van Rossum |
| 27 | |
# Version string of this module.
__version__ = "0.3.1"
| 29 | |
| 30 | import string, regex |
| 31 | |
| 32 | # First a little helper, since I don't like to repeat things. (Tismer speaking) |
| 33 | import string |
def replace(where, what, replacement):
    """Return a copy of 'where' with each occurrence of 'what' replaced.

    where       -- the string to operate on
    what        -- the substring to look for; must not be empty
                   (an empty separator raises ValueError)
    replacement -- the text substituted for every occurrence of 'what'

    The third parameter used to be named 'with'.  That is a reserved
    word in later Python versions and made the whole module a syntax
    error there.  The callers in this module pass it positionally, so
    the rename does not affect them.
    """
    # Split on the old text and glue the pieces together with the new one.
    return replacement.join(where.split(what))
| 36 | |
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
# "exec" is a statement keyword just like "assert" and "print", so it
# is listed as well; leaving it out meant it was never tagged.
# Every entry becomes one alternative of the keyword pattern built
# further down in this module.
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise"]
| 47 | |
# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
# The patterns are compiled with the 'regex' module and use its
# backslash-escaped operators: \( \) for grouping and \| for
# alternation.
commentPat = "#.*"

# Single-line string literal.  'q' is a placeholder for the quote
# character; it is substituted below, once for ' and once for ".
pat = "q[^\q\n]*\(\\\\[\000-\377][^\q\n]*\)*q"
quotePat = replace(pat, "q", "'") + "\|" + replace(pat, 'q', '"')

# Way to go, Tim!
# Triple-quoted string literal, again with 'q' as the quote
# placeholder.  The layout below is only for readability: all
# whitespace is removed before the pattern is used.
pat = """
    qqq
    [^\\q]*
    \(
        \(  \\\\[\000-\377]
        \|  q
            \(  \\\\[\000-\377]
            \|  [^\\q]
            \|  q
                \(  \\\\[\000-\377]
                \|  [^\\q]
                \)
            \)
        \)
        [^\\q]*
    \)*
    qqq
"""
pat = string.join(string.split(pat), '')    # get rid of whitespace
tripleQuotePat = replace(pat, "q", "'") + "\|" + replace(pat, 'q', '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat identifies characters which may legally precede
# a keyword pattern.  The same pattern is also required after the
# keyword, so a keyword match carries one context character on the
# right and (unless it matched via ^) one on the left.
nonKeyPat = "\(^\|[^a-zA-Z0-9_.\"']\)"

keyPat = nonKeyPat + "\(" + string.join(keywordsList, "\|") + "\)" + nonKeyPat

# commentPat comes first on purpose: nonKeyPat accepts '#', so with
# keyPat in front a comment such as "#if x" could be claimed by the
# keyword alternative as "#if ", and fontify() would then tag only
# that fragment as the comment.
# NOTE(review): this relies on the regex engine preferring earlier
# alternatives at the same position -- confirm against the regex docs.
matchPat = commentPat + "\|" + keyPat + "\|" + tripleQuotePat + "\|" + quotePat
matchRE = regex.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # Ident w. leading whitespace.
idRE = regex.compile(idKeyPat)
| 94 | |
| 95 | |
def fontify(pytext, searchfrom = 0, searchto = None):
    """Return a list of tags describing the interesting parts of pytext.

    pytext     -- string containing Python source code
    searchfrom -- index in pytext at which scanning starts (default 0)
    searchto   -- scanning stops at the first match that starts at or
                  after this index (default: len(pytext))

    Each tag is a tuple (tag, startindex, endindex, None) where tag is
    'keyword', 'string', 'comment' or 'identifier'.  An 'identifier'
    tag is only produced for the name directly following a 'def' or
    'class' keyword; it includes the whitespace in front of the name,
    because idRE matches leading blanks and tabs.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    group = matchRE.group
    idSearch = idRE.search
    idGroup = idRE.group

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while 1:
        # search() yields the index of the next match, negative if none.
        start = search(pytext, end)
        if start < 0 or start >= searchto:
            break   # EXIT LOOP
        match = group(0)
        end = start + len(match)
        c = match[0]
        if c not in "#'\"":
            # Must have matched a keyword.
            # The match always ends with one context character, and
            # starts with one unless the keyword sat at the very
            # beginning of a match via '^'.  Decide this from the text
            # itself: every keyword starts with a lowercase letter and
            # the context character never is one.  (Comparing 'start'
            # with 'searchfrom' gave wrong offsets whenever the text at
            # 'searchfrom' began with such a context character.)
            if not ("a" <= c <= "z"):
                # there's still a redundant char before and after it, strip!
                match = match[1:-1]
                start = start + 1
            else:
                # Nothing in front of the keyword.
                # Only a space at the end.
                match = match[:-1]
            # Give the trailing context character back so that it can
            # serve as the leading context of the next keyword.
            end = end - 1
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if match in ["def", "class"]:
                start = idSearch(pytext, end)
                # Only accept a name that starts right after the keyword.
                if start == end:
                    match = idGroup(0)
                    end = start + len(match)
                    tags_append((identifierTag, start, end, None))
        elif c == "#":
            tags_append((commentTag, start, end, None))
        else:
            tags_append((stringTag, start, end, None))
    return tags
| 146 | |
| 147 | |
def test(path):
    """Print every tag fontify() finds in the file named 'path'.

    One line is written per tag: the tag name followed by the repr of
    the tagged piece of text.
    """
    f = open(path)
    try:
        text = f.read()
    finally:
        # Close the file even when reading fails.
        f.close()
    tags = fontify(text)
    for tag, start, end, sublist in tags:
        # Single parenthesized expression: prints the same line whether
        # print is a statement or a function.
        print("%s %s" % (tag, repr(text[start:end])))