blob: a61de659b8f6cce1f08b44af24c8bd12b397f1d7 [file] [log] [blame]
"""Module to analyze Python source code; for syntax coloring tools.

Interface:
    tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' delimit the slice of
pytext that is to be analyzed.
The returned value is a list of tuples, formatted like this:
    [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
The tuple contents are always like this:
    (tag, startindex, endindex, sublist)
tag is one of 'keyword', 'string', 'comment' or 'identifier'
sublist is not used, hence always None.
"""
15
16# Based on FontText.py by Mitchell S. Chapman,
17# which was modified by Zachary Roadhouse,
18# then un-Tk'd by Just van Rossum.
19# Many thanks for regular expression debugging & authoring are due to:
#   Tim (the-incredib-ly y'rs) Peters and Christian Tismer
21# So, who owns the copyright? ;-) How about this:
22# Copyright 1996-1997:
23# Mitchell S. Chapman,
24# Zachary Roadhouse,
25# Tim Peters,
26# Just van Rossum
27
28__version__ = "0.3.1"
29
30import string, regex
31
32# First a little helper, since I don't like to repeat things. (Tismer speaking)
33import string
def replace(where, what, with_):
    """Return a copy of 'where' with every occurrence of 'what' replaced.

    where -- the string to work on.
    what  -- the substring to look for (must not be empty).
    with_ -- the string that is substituted for each occurrence.

    The third parameter used to be called 'with', which is a reserved
    word in current Python and made this definition a syntax error.
    All callers in this module pass the arguments positionally, so the
    rename does not affect them.
    """
    # Splitting on the separator and re-joining with the replacement is
    # the same split/join approach the original helper used.
    return with_.join(where.split(what))
36
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
# "exec" belongs to that keyword list as well; it was missing here, so
# exec statements were never tagged as keywords.
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise"]
47
# The pieces below are combined into one regular expression that finds
# everything of interest in the source text: comments, strings
# (including multi-line triple-quoted strings) and keywords.

# A comment runs from '#' up to the end of the line.
commentPat = "#.*"

# Template for a string literal that fits on one line; the letter "q"
# stands for the quote character and is substituted below, once for
# single and once for double quotes.
pat = "q[^\q\n]*\(\\\\[\000-\377][^\q\n]*\)*q"
quotePat = string.join([replace(pat, "q", "'"), replace(pat, 'q', '"')], "\|")
54
# Way to go, Tim!
# Template for a triple-quoted string; as above, "q" stands for the
# quote character.  The template is laid out over several lines for
# readability only: all whitespace is removed again before it is used.
pat = """
    qqq
    [^\\q]*
    \(
        \(  \\\\[\000-\377]
        \|  q
            \(  \\\\[\000-\377]
            \|  [^\\q]
            \|  q
                \(  \\\\[\000-\377]
                \|  [^\\q]
                \)
            \)
        \)
        [^\\q]*
    \)*
    qqq
"""
pat = string.join(string.split(pat), '')    # get rid of whitespace
tripleQuotePat = string.join([replace(pat, "q", "'"), replace(pat, 'q', '"')], "\|")
76
# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat identifies characters which may legally precede
# (or follow) a keyword; it is used on both sides of the keyword
# alternation, so a keyword match normally carries one extra
# character at each end.
nonKeyPat = "\(^\|[^a-zA-Z0-9_.\"']\)"

keyPat = nonKeyPat + "\(" + string.join(keywordsList, "\|") + "\)" + nonKeyPat

# commentPat has to come before keyPat.  nonKeyPat accepts '#' as the
# character in front of a keyword, so with keyPat listed first a comment
# such as "#if 0 ..." was matched as the keyword text "#if " and only
# those four characters were tagged as comment; the remainder of the
# line was then scanned as if it were code.  (This relies on the
# alternatives being tried in the order in which they are written.)
# tripleQuotePat stays ahead of quotePat, as before.
matchPat = commentPat + "\|" + keyPat + "\|" + tripleQuotePat + "\|" + quotePat
matchRE = regex.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # Ident w. leading whitespace.
idRE = regex.compile(idKeyPat)
94
95
def fontify(pytext, searchfrom = 0, searchto = None):
    """Return a list of syntax coloring tags for Python source text.

    pytext     -- string containing Python source code.
    searchfrom -- index in pytext at which scanning starts (default 0).
    searchto   -- scanning stops at the first match that starts at or
                  beyond this index (default: len(pytext)).

    The result is a list of tuples (tag, startindex, endindex, None),
    in the order in which the items occur in pytext.  tag is one of
    'keyword', 'string', 'comment' or 'identifier'.  An 'identifier'
    tag is produced for the name that directly follows "def" or
    "class"; its range starts right behind the keyword and therefore
    includes the blanks between the keyword and the name.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    group = matchRE.group
    idSearch = idRE.search
    idGroup = idRE.group

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while 1:
        # search() yields the index of the next match; a negative
        # value means that nothing was found.
        start = search(pytext, end)
        if start < 0 or start >= searchto:
            break   # EXIT LOOP
        match = group(0)
        end = start + len(match)
        c = match[0]
        if c == "#":
            tags_append((commentTag, start, end, None))
        elif c in "'\"":
            tags_append((stringTag, start, end, None))
        else:
            # Must have matched a keyword.  The match always ends with
            # one character that is not part of the keyword, and it
            # usually starts with one as well.  Whether there is such a
            # leading character is decided by looking at the match
            # itself: comparing 'start' with 'searchfrom' (as was done
            # before) went wrong when the scan started on a separator,
            # e.g. for "\nimport x" -- the separator was then tagged as
            # part of the keyword and "def"/"class" were not recognized.
            if ("a" <= c <= "z") or ("A" <= c <= "Z") or c == "_":
                # The keyword itself starts the match; only strip the
                # trailing character.
                match = match[:-1]
            else:
                # Strip the separator on both sides.
                match = match[1:-1]
                start = start + 1
            # Give the trailing character back, so that it can serve
            # as the leading character of the next match.
            end = end - 1
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if match in ["def", "class"]:
                start = idSearch(pytext, end)
                # Only accept an identifier that starts right here.
                if start == end:
                    match = idGroup(0)
                    end = start + len(match)
                    tags_append((identifierTag, start, end, None))
    return tags
146
147
def test(path):
    """Fontify the file at 'path' and print the tags that were found.

    One line is printed per tag: the tag name, a space, and the repr
    of the piece of text the tag covers.
    """
    f = open(path)
    try:
        text = f.read()
    finally:
        # Close the file even when reading fails.
        f.close()
    for tag, start, end, sublist in fontify(text):
        # A single parenthesized argument works both as a print
        # statement and as a call to the print function; the output is
        # the same as that of the former "print tag, `text`".
        print("%s %s" % (tag, repr(text[start:end])))