blob: b5d6102777138b228739da39ccc0a2338aab40c4 [file] [log] [blame]
"""Module to analyze Python source code; for syntax coloring tools.

Interface:
    tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' delimit the slice of
pytext that is to be searched.
The returned value is a list of tuples, formatted like this:
    [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
The tuple contents are always like this:
    (tag, startindex, endindex, sublist)
tag is one of 'keyword', 'string', 'comment' or 'identifier'
sublist is not used, hence always None.
"""
15
# Based on FontText.py by Mitchell S. Chapman,
# which was modified by Zachary Roadhouse,
# then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
#   Tim (the-incredib-ly y'rs) Peters and Christian Tismer
# So, who owns the copyright? ;-) How about this:
# Copyright 1996-2000:
#   Mitchell S. Chapman,
#   Zachary Roadhouse,
#   Tim Peters,
#   Just van Rossum
27
Just van Rossumed2ed942000-07-01 14:30:08 +000028__version__ = "0.3.3"
Just van Rossum40f9b7b1999-01-30 22:39:17 +000029
Jack Jansen9ad27522001-02-21 13:54:31 +000030import string, re
Just van Rossum40f9b7b1999-01-30 22:39:17 +000031
32# First a little helper, since I don't like to repeat things. (Tismer speaking)
33import string
def replace(where, what, with_):
    """Return `where` with every occurrence of `what` replaced by `with_`.

    The third parameter was originally spelled `with`, which is a reserved
    word in current Python and made this definition a syntax error; it is
    only ever passed positionally, so the rename does not affect callers.

    Implemented as split-and-join (rather than str.replace) so that an empty
    `what` still raises ValueError, exactly as the original helper did.
    """
    return with_.join(where.split(what))
36
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise"]

# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
# All patterns below are written in `re` syntax (plain "(", "|", ")" for
# grouping and alternation) as raw strings, because they are compiled
# with re.compile().
commentPat = r"#.*"

# A string delimited by a single quote character on one logical line.
# "q" is a placeholder for the quote character.  A backslash always
# swallows the character after it ([\s\S] = any character, newline included).
pat = r"q[^\\q\n]*(?:\\[\s\S][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Way to go, Tim!
# Triple-quoted string: inside the body one or two consecutive quote
# characters are allowed, three in a row terminate the string.
pat = r"""
    qqq
    [^\\q]*
    (?:
        (?: \\[\s\S]
        |   q
            (?: \\[\s\S]
            |   [^\\q]
            |   q
                (?: \\[\s\S]
                |   [^\\q]
                )
            )
        )
        [^\\q]*
    )*
    qqq
"""
pat = "".join(pat.split())  # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat asserts (without consuming anything) that the character in
# front of a keyword, if there is one, may legally precede a keyword.
# The same character class is used as a look-ahead behind the keyword, so
# a keyword match covers exactly the keyword itself and a keyword at the
# very start or very end of the text is recognised too.
nonKeyPat = r"(?<![a-zA-Z0-9_.\"'])"

keyPat = (nonKeyPat
          + "(?:" + "|".join(keywordsList) + ")"
          + r"(?![a-zA-Z0-9_.\"'])")

matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # Ident w. leading whitespace.
idRE = re.compile(idKeyPat)


def fontify(pytext, searchfrom=0, searchto=None):
    """Return a list of colouring tags for the Python source in `pytext`.

    Scanning starts at index `searchfrom` and stops at the first match that
    begins at or after `searchto` (default: the end of `pytext`).  A match
    that begins before `searchto` is reported in full even if it extends
    past it.

    Each tag is a tuple ``(tag, startindex, endindex, None)`` where `tag` is
    'keyword', 'string', 'comment' or 'identifier' and
    ``pytext[startindex:endindex]`` is the tagged text.  An 'identifier' tag
    is emitted for the name directly following a ``def`` or ``class``
    keyword; it includes the whitespace between the keyword and the name.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    idMatch = idRE.match

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while True:
        m = search(pytext, end)
        if m is None:
            break  # EXIT LOOP: nothing interesting left
        start = m.start()
        if start >= searchto:
            break  # EXIT LOOP: beyond the requested range
        match = m.group(0)
        end = m.end()
        # Every alternative of matchPat matches at least one character, so
        # match[0] exists and the scan always advances.
        c = match[0]
        if c not in "#'\"":
            # Must have matched a keyword.  The context around it is checked
            # by zero-width assertions, so the span is the keyword itself.
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if match in ("def", "class"):
                # Anchored match: the identifier has to start right here.
                m = idMatch(pytext, end)
                if m is not None:
                    start = m.start()
                    end = m.end()
                    tags_append((identifierTag, start, end, None))
        elif c == "#":
            tags_append((commentTag, start, end, None))
        else:
            tags_append((stringTag, start, end, None))
    return tags
144
145
def test(path):
    """Fontify the file at `path` and print every tag found.

    One line is written per tag: the tag name followed by the repr of the
    text it covers.
    """
    # The with-block closes the file even when reading fails.
    with open(path) as f:
        text = f.read()
    for tag, start, end, sublist in fontify(text):
        # %-formatting into a single string gives the same output whether
        # print is a statement or a function.
        print("%s %r" % (tag, text[start:end]))