blob: d60456c67696540803d94ec1199fdc5447d5922e [file] [log] [blame]
Just van Rossum40f9b7b1999-01-30 22:39:17 +00001"""Module to analyze Python source code; for syntax coloring tools.
2
3Interface:
4 tags = fontify(pytext, searchfrom, searchto)
5
6The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' delimit the slice of pytext that is scanned.
8The returned value is a list of tuples, formatted like this:
9 [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
10The tuple contents are always like this:
11 (tag, startindex, endindex, sublist)
12tag is one of 'keyword', 'string', 'comment' or 'identifier'
13sublist is not used, hence always None.
14"""
15
16# Based on FontText.py by Mitchell S. Chapman,
17# which was modified by Zachary Roadhouse,
18# then un-Tk'd by Just van Rossum.
19# Many thanks for regular expression debugging & authoring are due to:
# Tim (the-incredib-ly y'rs) Peters and Christian Tismer
21# So, who owns the copyright? ;-) How about this:
22# Copyright 1996-1997:
23# Mitchell S. Chapman,
24# Zachary Roadhouse,
25# Tim Peters,
26# Just van Rossum
27
Just van Rossum6e5f2d12000-04-09 19:44:13 +000028__version__ = "0.3.2"
Just van Rossum40f9b7b1999-01-30 22:39:17 +000029
30import string, regex
31
32# First a little helper, since I don't like to repeat things. (Tismer speaking)
33import string
def replace(where, what, with_):
    """Return a copy of `where` with every occurrence of `what` replaced.

    where -- the string to process
    what  -- the substring to look for; an empty string raises ValueError
             (from the underlying split)
    with_ -- the replacement text

    The third parameter carries a trailing underscore because `with` is a
    reserved word from Python 2.6 onwards and cannot be used as a parameter
    name there.  Callers pass it positionally.

    The split/join formulation is kept (instead of str.replace) so that an
    empty `what` keeps raising rather than silently inserting `with_`
    between every character.
    """
    return with_.join(where.split(what))
36
37# This list of keywords is taken from ref/node13.html of the
38# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
# Words that get the 'keyword' tag.  The sequence is significant only in
# that the keyword pattern is assembled by walking this list in order, so
# the order is left exactly as it has always been.
keywordsList = [
    "assert", "exec", "del", "from", "lambda", "return", "and",
    "elif", "global", "not", "try", "break", "else", "if",
    "or", "while", "class", "except", "import", "pass", "continue",
    "finally", "in", "print", "def", "for", "is", "raise",
]
47
# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.

# A comment: a '#' followed by whatever `.*` matches after it.
commentPat = "#.*"

# Template for an ordinary (single-line) string literal.  The letter "q" is
# a placeholder for the quote character: the template is instantiated once
# with ' and once with " and the two results are joined with "\|"
# (`\(`, `\)` and `\|` are used throughout this file as the grouping and
# alternation operators of the pattern syntax).
#
# Read left to right the template is: an opening quote; a run of
# characters from the class [^\q\n]; zero or more repetitions of
# (a backslash followed by any one character in the range \000-\377, then
# another [^\q\n] run); a closing quote.
#
# NOTE(review): "\q" is not a Python string escape, so the backslash is
# kept in the pattern text.  Presumably the class is meant to exclude
# backslash, the quote character and newline -- confirm against the
# bracket-expression rules of the regex module.
pat = "q[^\q\n]*\(\\\\[\000-\377][^\q\n]*\)*q"
quotePat = replace(pat, "q", "'") + "\|" + replace(pat, 'q', '"')
54
# Way to go, Tim!
#
# Template for a triple-quoted string literal, again with "q" standing in
# for the quote character.  It is spread over several lines purely for the
# human reader: all whitespace is removed from it right after the literal,
# before the placeholder is substituted.
#
# Structure of the template:
#   qqq                 three opening quotes
#   [^\q]*              a run of characters from that class
#   \( ... \)*          repeated any number of times:
#                         - a backslash followed by any one character, or
#                         - one quote followed by: a backslash pair, or a
#                           character from [^\q], or a second quote that is
#                           itself followed by a backslash pair or a
#                           character from [^\q];
#                         then another [^\q]* run
#   qqq                 three closing quotes
# In other words, one or two consecutive quotes may appear inside the body
# only when followed by something that is not a third quote.
#
# NOTE(review): "\\q" in this literal yields backslash + q in the pattern
# text; presumably the class excludes both backslash and the quote
# character -- confirm against the regex module's bracket rules.
pat = """
 qqq
 [^\\q]*
 \(
 \( \\\\[\000-\377]
 \| q
 \( \\\\[\000-\377]
 \| [^\\q]
 \| q
 \( \\\\[\000-\377]
 \| [^\\q]
 \)
 \)
 \)
 [^\\q]*
 \)*
 qqq
"""
pat = string.join(string.split(pat), '') # get rid of whitespace
# Instantiate the template for ''' and for """ and accept either one.
tripleQuotePat = replace(pat, "q", "'") + "\|" + replace(pat, 'q', '"')
76
# Assemble the pattern that recognises Python keywords (and nothing else),
# so that ordinary identifier references can be skipped entirely.
#
# nonKeyPat describes what may sit directly next to a keyword: either the
# `^` anchor or one character that is not a letter, digit, underscore, dot
# or quote.  It is placed on both sides of the keyword alternation, which
# means a match normally carries one extra character on each side.
nonKeyPat = "\(^\|[^a-zA-Z0-9_.\"']\)"

# All keywords, separated by the alternation operator and wrapped in a group.
keyPat = nonKeyPat + "\(" + string.join(keywordsList, "\|") + "\)" + nonKeyPat

# The master pattern: keyword, comment, triple-quoted string or plain
# string, tried in that order.
matchPat = string.join([keyPat, commentPat, tripleQuotePat, quotePat], "\|")
matchRE = regex.compile(matchPat)

# An identifier (dots allowed) together with any blanks/tabs in front of it.
idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"
idRE = regex.compile(idKeyPat)
94
95
def fontify(pytext, searchfrom=0, searchto=None):
    """Scan Python source text and return a list of syntax-colouring tags.

    pytext     -- string containing Python source code
    searchfrom -- index at which scanning starts (default 0)
    searchto   -- scanning stops at the first match that begins at or after
                  this index (default: len(pytext))

    Returns a list of (tag, startindex, endindex, None) tuples, in the
    order the items occur in the text.  tag is 'keyword', 'identifier',
    'comment' or 'string'.  An 'identifier' tag is produced only for the
    name that directly follows a `def` or `class` keyword, and its span
    includes the blanks/tabs between the keyword and the name, because the
    identifier pattern itself starts with optional blanks/tabs.

    Match results are read back from the module-level compiled pattern
    objects (`matchRE`, `idRE`) after each search.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    group = matchRE.group
    idSearch = idRE.search
    idGroup = idRE.group

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while 1:
        start = search(pytext, end)
        if start < 0 or start >= searchto:
            break  # EXIT LOOP
        match = group(0)
        end = start + len(match)
        c = match[0]
        if c == "#":
            tags_append((commentTag, start, end, None))
        elif c in "'\"":
            tags_append((stringTag, start, end, None))
        else:
            # Must have matched a keyword.  The keyword pattern is wrapped
            # in boundary patterns on both sides, so the match may carry
            # one extra character in front and always carries one behind.
            #
            # Every keyword starts with a lowercase ASCII letter and the
            # boundary character class excludes letters, so the first
            # character of the match tells reliably whether a leading
            # boundary character was consumed.  (Deciding this by comparing
            # `start` with `searchfrom` is not sufficient: text such as
            # "\ndef f" scanned from index 0 matches at index 0 *with* a
            # leading newline.)
            if not ('a' <= c <= 'z'):
                match = match[1:]
                start = start + 1
            # Give the trailing boundary character back so that the next
            # search can use it as the leading boundary of a following
            # keyword (e.g. "not in").
            match = match[:-1]
            end = end - 1
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if match in ["def", "class"]:
                start = idSearch(pytext, end)
                # Only accept an identifier that begins right where the
                # keyword ended; anything found further on is unrelated.
                if start == end:
                    match = idGroup(0)
                    end = start + len(match)
                    tags_append((identifierTag, start, end, None))
    return tags
146
147
def test(path):
    """Fontify the file at `path` and print every tag that was found.

    One line is written to standard output per tag: the tag name, a space,
    and the repr() of the tagged slice of the file's text.
    """
    f = open(path)
    try:
        text = f.read()
    finally:
        # Release the handle even if reading fails.
        f.close()
    for tag, start, end, sublist in fontify(text):
        # A single parenthesised string prints identically whether `print`
        # is a statement or a function.
        print("%s %s" % (tag, repr(text[start:end])))