"""Module to analyze Python source code; for syntax coloring tools.

Interface:
    tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The optional arguments 'searchfrom' and 'searchto' delimit the slice of
pytext that is scanned.
The returned value is a list of tuples, formatted like this:
    [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
The tuple contents are always like this:
    (tag, startindex, endindex, sublist)
tag is one of 'keyword', 'string', 'comment' or 'identifier'
sublist is not used, hence always None.
"""
| 15 | |
| 16 | # Based on FontText.py by Mitchell S. Chapman, |
| 17 | # which was modified by Zachary Roadhouse, |
| 18 | # then un-Tk'd by Just van Rossum. |
| 19 | # Many thanks for regular expression debugging & authoring are due to: |
#    Tim (the-incredib-ly y'rs) Peters and Christian Tismer
| 21 | # So, who owns the copyright? ;-) How about this: |
Just van Rossum | ed2ed94 | 2000-07-01 14:30:08 +0000 | [diff] [blame] | 22 | # Copyright 1996-2000: |
Just van Rossum | 40f9b7b | 1999-01-30 22:39:17 +0000 | [diff] [blame] | 23 | # Mitchell S. Chapman, |
| 24 | # Zachary Roadhouse, |
| 25 | # Tim Peters, |
| 26 | # Just van Rossum |
| 27 | |
Just van Rossum | ed2ed94 | 2000-07-01 14:30:08 +0000 | [diff] [blame] | 28 | __version__ = "0.3.3" |
Just van Rossum | 40f9b7b | 1999-01-30 22:39:17 +0000 | [diff] [blame] | 29 | |
Jack Jansen | 9ad2752 | 2001-02-21 13:54:31 +0000 | [diff] [blame] | 30 | import string, re |
Just van Rossum | 40f9b7b | 1999-01-30 22:39:17 +0000 | [diff] [blame] | 31 | |
| 32 | # First a little helper, since I don't like to repeat things. (Tismer speaking) |
| 33 | import string |
def replace(where, what, with_):
    """Return a copy of 'where' with each occurrence of 'what' replaced.

    where -- the string to operate on
    what  -- the substring to look for; as with str.split(), None means
             "runs of whitespace" and an empty string raises ValueError
    with_ -- the replacement text

    The third parameter used to be spelled 'with'.  That is a reserved
    word in current Python, so it carries a trailing underscore now;
    positional calls are unaffected.
    """
    # Split-and-join rather than str.replace() keeps the historical
    # behaviour for an empty or None 'what'.
    return with_.join(where.split(what))
| 36 | |
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise"]

# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
# Everything below is written in the syntax of the 're' module, where
# bare ( | ) are operators.  The backslashed \( \| \) spelling of the old
# 'regex' module is taken as literal characters by re.compile().
commentPat = r"#.*"

# In the two string templates below, 'q' stands for the quote character;
# each template is instantiated once for ' and once for ".
# Single-line string: quote, then any run of ordinary characters and
# backslash escapes (a backslash followed by any character at all), quote.
pat = r"q[^\\q\n]*(?:\\[\s\S][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Way to go, Tim!
# Triple-quoted string: the body may contain escapes, and one or two
# consecutive quote characters as long as they are not followed by a
# third one.
pat = r"""
    qqq
    [^\\q]*
    (?:
        (?: \\[\s\S]
        |   q
            (?: \\[\s\S]
            |   [^\\q]
            |   q
                (?: \\[\s\S]
                |   [^\\q]
                )
            )
        )
        [^\\q]*
    )*
    qqq
"""
pat = "".join(pat.split())  # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# keyCharPat lists the characters that must NOT touch a keyword on
# either side.  nonKeyPat asserts that the character preceding a keyword
# (if any) is not one of them.  Both sides are zero-width assertions, so
# a match covers exactly the keyword, and a keyword at the very start or
# end of the text, or right at the search offset, is still found.
keyCharPat = r"[a-zA-Z0-9_.\"']"
nonKeyPat = "(?<!" + keyCharPat + ")"

keyPat = (nonKeyPat
          + "(?P<keyword>" + "|".join(keywordsList) + ")"
          + "(?!" + keyCharPat + ")")

matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # Ident w. leading whitespace.
idRE = re.compile(idKeyPat)


def fontify(pytext, searchfrom=0, searchto=None):
    """Return a list of syntax-coloring tags for the Python source 'pytext'.

    Scanning starts at index 'searchfrom' and stops at the first item
    that begins at or after 'searchto' (default: the end of pytext).  An
    item that begins before 'searchto' is reported in full, even if it
    extends past that index.

    Each tag is a tuple (tag, startindex, endindex, None), where tag is
    'keyword', 'string', 'comment' or 'identifier' and
    pytext[startindex:endindex] is the tagged text.  An 'identifier' tag
    is emitted directly after a 'def' or 'class' keyword; it covers the
    name being defined together with the blanks in front of it.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    identMatch = idRE.match

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while True:
        m = search(pytext, end)
        if m is None:
            break   # EXIT LOOP
        # Offsets always come from the match object; every alternative
        # of matchPat consumes at least one character, so 'end' advances
        # on each pass.
        start, end = m.span()
        if start >= searchto:
            break   # EXIT LOOP
        keyword = m.group("keyword")
        if keyword is not None:
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if keyword in ("def", "class"):
                m = identMatch(pytext, end)
                if m is not None:
                    start, end = m.span()
                    tags_append((identifierTag, start, end, None))
        elif pytext[start] == "#":
            tags_append((commentTag, start, end, None))
        else:
            tags_append((stringTag, start, end, None))
    return tags
| 144 | |
| 145 | |
def test(path):
    """Print the tags fontify() finds in the file at 'path'.

    One line is written per tag: the tag name, a space, and the repr()
    of the source text the tag covers.
    """
    f = open(path)
    try:
        text = f.read()
    finally:
        # Close the file even when reading fails.
        f.close()
    for tag, start, end, sublist in fontify(text):
        # A single pre-formatted string prints identically whether
        # 'print' is a statement or a function.
        print("%s %r" % (tag, text[start:end]))