"""Module to analyze Python source code; for syntax coloring tools.

Interface:
    tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' delimit the slice of
pytext in which tags are looked for.
The returned value is a list of tuples, formatted like this:
    [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
The tuple contents are always like this:
    (tag, startindex, endindex, sublist)
tag is one of 'keyword', 'string', 'comment' or 'identifier'
sublist is not used, hence always None.
"""
| 15 | |
# Based on FontText.py by Mitchell S. Chapman,
# which was modified by Zachary Roadhouse,
# then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
#    Tim (the-incredib-ly y'rs) Peters and Christian Tismer
# So, who owns the copyright? ;-) How about this:
# Copyright 1996-2001:
#    Mitchell S. Chapman,
#    Zachary Roadhouse,
#    Tim Peters,
#    Just van Rossum
| 27 | |
Just van Rossum | 3eec762 | 2001-07-10 19:25:40 +0000 | [diff] [blame] | 28 | __version__ = "0.4" |
Just van Rossum | 40f9b7b | 1999-01-30 22:39:17 +0000 | [diff] [blame] | 29 | |
Just van Rossum | 3eec762 | 2001-07-10 19:25:40 +0000 | [diff] [blame] | 30 | import string |
| 31 | import re |
Just van Rossum | 40f9b7b | 1999-01-30 22:39:17 +0000 | [diff] [blame] | 32 | |
| 33 | # First a little helper, since I don't like to repeat things. (Tismer speaking) |
| 34 | import string |
def replace(where, what, with_):
    """Return a copy of `where` with every occurrence of `what` replaced.

    where -- the string to operate on
    what  -- the (non-empty) substring to look for
    with_ -- the text substituted for each occurrence of `what`

    The third parameter used to be spelled `with`, which is a reserved
    word in current Python and made this module fail to compile.  All
    callers in this file pass it positionally.

    Raises ValueError if `what` is the empty string (str.split rejects an
    empty separator).
    """
    # Split/join rather than str.replace so an empty `what` still raises
    # instead of silently inserting `with_` between every character.
    return with_.join(where.split(what))
| 37 | |
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise", "yield"]

# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
commentPat = r"#[^\n]*"

# Template for a one-line string literal.  "q" is a placeholder for the
# quote character and is substituted below, once per quote style.
# "\\[\s\S]" is a backslash followed by any character whatsoever; the
# former "[\000-\377]" stopped at code point 255 and therefore failed on
# Unicode text with a non-Latin-1 character after a backslash.
pat = r"q[^\\q\n]*(\\[\s\S][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Way to go, Tim!
# Template for a triple-quoted string: inside the literal, one or two
# consecutive quotes are allowed as long as they are followed by an escape
# pair or by a character that is neither a quote nor a backslash.
# (The middle alternative used to read "[^\q]", which also accepted a bare
# backslash and so let an escaped quote take part in closing the string.)
pat = r"""
    qqq
    [^\\q]*
    (
        (   \\[\s\S]
        |   q
            (   \\[\s\S]
            |   [^\\q]
            |   q
                (   \\[\s\S]
                |   [^\\q]
                )
            )
        )
        [^\\q]*
    )*
    qqq
"""
pat = "".join(pat.split())  # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat identifies characters which may legally precede
# a keyword pattern.
nonKeyPat = r"(^|[^a-zA-Z0-9_.\"'])"

# Capture groups of keyPat, which are also groups 1-3 of matchRE because
# commentPat contains no groups:
#   1: the character before the keyword (empty at the start of the text)
#   2: the keyword itself
#   3: the character after the keyword
# NOTE(review): the trailing nonKeyPat needs a real character (its "^"
# alternative can only match at index 0), so a keyword that is the very
# last thing in the text is not matched.  Changing that also requires
# changing how fontify() trims keyword matches.
keyPat = nonKeyPat + "(" + "|".join(keywordsList) + ")" + nonKeyPat

matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # Ident w. leading whitespace.
idRE = re.compile(idKeyPat)
Just van Rossum | 40f9b7b | 1999-01-30 22:39:17 +0000 | [diff] [blame] | 92 | |
| 93 | |
def fontify(pytext, searchfrom=0, searchto=None):
    """Return the list of syntax-coloring tags found in pytext.

    pytext     -- string containing Python source code
    searchfrom -- index in pytext at which scanning starts (default 0)
    searchto   -- scanning stops at the first match that begins at or
                  after this index (default: len(pytext)); a match that
                  begins before it may extend past it

    Each tag is a tuple (tag, startindex, endindex, None) where tag is one
    of 'keyword', 'string', 'comment' or 'identifier'.  An 'identifier'
    tag is only produced for the name directly following "def" or "class";
    its span starts right after the keyword and therefore includes the
    blanks or tabs between the keyword and the name.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    idMatch = idRE.match

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while 1:
        m = search(pytext, end)
        if m is None:
            break  # EXIT LOOP
        start = m.start()
        if start >= searchto:
            break  # EXIT LOOP
        # Group 2 of matchRE is the bare keyword; it is only set when the
        # keyword alternative matched (comments and strings leave it None).
        keyword = m.group(2)
        if keyword is not None:
            # The whole match also contains the character after the keyword
            # and, except at the very start of the text, the one before it.
            # Take the keyword's own span instead of guessing from the
            # match position whether a leading character is present.
            # Resuming at the end of the keyword lets the trailing
            # character serve as the leading one of the next match.
            start, end = m.span(2)
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if keyword in ("def", "class"):
                m = idMatch(pytext, end)
                if m is not None:
                    start, end = m.span()
                    tags_append((identifierTag, start, end, None))
        else:
            end = m.end()
            if pytext[start] == "#":
                tags_append((commentTag, start, end, None))
            else:
                tags_append((stringTag, start, end, None))
    return tags
| 147 | |
| 148 | |
def test(path):
    """Fontify the file at `path` and print one line per tag found.

    Each output line holds the tag name followed by the repr() of the
    piece of text the tag covers.
    """
    # The with-block closes the file even if read() raises.
    with open(path) as f:
        text = f.read()
    tags = fontify(text)
    for tag, start, end, sublist in tags:
        # A single pre-formatted string prints identically whether print
        # is a statement (Python 2) or a function (Python 3).
        print("%s %r" % (tag, text[start:end]))