blob: 5680aa871ac328878fe7ea88a4ba7bcd3367b658 [file] [log] [blame]
"""Module to analyze Python source code; for syntax coloring tools.

Interface:
    tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' delimit the slice of pytext to scan.
The returned value is a list of tuples, formatted like this:
    [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
The tuple contents are always like this:
    (tag, startindex, endindex, sublist)
tag is one of 'keyword', 'string', 'comment' or 'identifier'
sublist is not used, hence always None.
"""
15
# Based on FontText.py by Mitchell S. Chapman,
# which was modified by Zachary Roadhouse,
# then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
#    Tim (the-incredib-ly y'rs) Peters and Christian Tismer
# So, who owns the copyright? ;-) How about this:
# Copyright 1996-2001:
#    Mitchell S. Chapman,
#    Zachary Roadhouse,
#    Tim Peters,
#    Just van Rossum
27
Just van Rossum3eec7622001-07-10 19:25:40 +000028__version__ = "0.4"
Just van Rossum40f9b7b1999-01-30 22:39:17 +000029
Just van Rossum3eec7622001-07-10 19:25:40 +000030import string
31import re
Just van Rossum40f9b7b1999-01-30 22:39:17 +000032
33# First a little helper, since I don't like to repeat things. (Tismer speaking)
34import string
def replace(where, what, with_):
    """Return a copy of `where` with every occurrence of `what` replaced by `with_`.

    The third parameter used to be spelled ``with``, which is a reserved word
    (and therefore a SyntaxError in a parameter list) from Python 2.6 onwards.
    It can only ever have been passed positionally on such interpreters, so
    the rename does not affect callers.

    Raises ValueError if `what` is the empty string (empty separator).
    """
    # String methods instead of string.split()/string.join(): the function
    # forms in the `string` module no longer exist in Python 3.
    return with_.join(where.split(what))
37
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise", "yield"]

# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
commentPat = r"#[^\n]*"

# In the templates below "q" stands for the quote character; each template
# is instantiated once for ' and once for ".
# A backslash escapes whatever character follows it.  [\s\S] is used for
# "any character at all" so that the pattern is not limited to code points
# up to \377 when the text is a unicode string.
pat = r"q[^\\q\n]*(\\[\s\S][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Way to go, Tim!
# After the opening qqq the body is made of: escapes, non-quote characters,
# a single quote, or two quotes in a row -- as long as that short run of
# quotes is followed by an escape or a non-quote character.  Every
# "non-quote" class must also exclude the backslash; otherwise the backslash
# of an escaped quote can be consumed on its own and the string appears to
# end early.
pat = r"""
    qqq
    [^\\q]*
    (
        (   \\[\s\S]
        |   q
            (   \\[\s\S]
            |   [^\\q]
            |   q
                (   \\[\s\S]
                |   [^\\q]
                )
            )
        )
        [^\\q]*
    )*
    qqq
"""
pat = "".join(pat.split())  # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat identifies characters which may legally precede
# (or follow) a keyword pattern.
nonKeyPat = r"(^|[^a-zA-Z0-9_.\"'])"

# NOTE: a keyword match includes the delimiter character after the keyword,
# and also the one before it unless the keyword sits at the very start of
# the text; the code consuming matchRE strips them again.
keyPat = nonKeyPat + "(" + "|".join(keywordsList) + ")" + nonKeyPat

matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # Ident w. leading whitespace.
idRE = re.compile(idKeyPat)
Just van Rossum40f9b7b1999-01-30 22:39:17 +000092
93
def fontify(pytext, searchfrom=0, searchto=None):
    """Scan Python source text and return a list of syntax-coloring tags.

    pytext     -- string containing Python source code.
    searchfrom -- index in pytext at which scanning starts (default 0).
    searchto   -- scanning stops at the first match that starts at or after
                  this index (default: len(pytext)).

    Returns a list of (tag, startindex, endindex, None) tuples in order of
    appearance, where tag is 'keyword', 'string', 'comment' or 'identifier'.
    An 'identifier' tag is emitted for the name directly following a "def"
    or "class" keyword; its span starts right after the keyword and so
    includes the whitespace between the keyword and the name.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    idMatch = idRE.match

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while True:
        m = search(pytext, end)
        if m is None:
            break  # EXIT LOOP
        start = m.start()
        if start >= searchto:
            break  # EXIT LOOP
        match = m.group(0)
        end = m.end()
        c = match[0]
        if c == "#":
            tags_append((commentTag, start, end, None))
        elif c in "'\"":
            tags_append((stringTag, start, end, None))
        else:
            # Must have matched a keyword.  The match always carries one
            # delimiter character after the keyword, and one before it
            # unless the keyword sits at the very start of the text.
            # Keywords consist of lowercase ASCII letters and the delimiter
            # is never a letter, so the first character tells the two cases
            # apart.  (Comparing the match position with searchfrom does
            # not: a keyword preceded by a space or newline can start
            # exactly at searchfrom.)
            if not ("a" <= c <= "z"):
                match = match[1:]
                start = start + 1
            match = match[:-1]
            # Leave the trailing delimiter unconsumed so that it can serve
            # as the leading delimiter of a directly following keyword.
            end = end - 1
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if match in ("def", "class"):
                # Anchored match: only an identifier starting right here
                # counts, so there is no need to scan the rest of the text.
                m = idMatch(pytext, end)
                if m is not None:
                    start, end = m.span()
                    tags_append((identifierTag, start, end, None))
    return tags
147
148
def test(path):
    """Fontify the file at `path` and print each tag with the text it covers.

    One line is printed per tag: the tag name followed by the repr of the
    tagged slice of the file's text.
    """
    f = open(path)
    try:
        text = f.read()
    finally:
        # Close the file even if reading fails.
        f.close()
    for tag, start, end, sublist in fontify(text):
        # A single pre-formatted string prints identically whether print is
        # a statement or a function; repr() replaces the backtick syntax.
        print("%s %r" % (tag, text[start:end]))