blob: 41f19424c39e5680f5fb17f3441778d1a22c23e0 [file] [log] [blame]
Just van Rossum40f9b7b1999-01-30 22:39:17 +00001"""Module to analyze Python source code; for syntax coloring tools.
2
3Interface:
Tim Peters182b5ac2004-07-18 06:16:08 +00004 tags = fontify(pytext, searchfrom, searchto)
Just van Rossum40f9b7b1999-01-30 22:39:17 +00005
6The 'pytext' argument is a string containing Python source code.
The optional arguments 'searchfrom' and 'searchto' delimit the slice of pytext that is scanned.
Just van Rossum40f9b7b1999-01-30 22:39:17 +00008The returned value is a list of tuples, formatted like this:
Tim Peters182b5ac2004-07-18 06:16:08 +00009 [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
Just van Rossum40f9b7b1999-01-30 22:39:17 +000010The tuple contents are always like this:
Tim Peters182b5ac2004-07-18 06:16:08 +000011 (tag, startindex, endindex, sublist)
Just van Rossum40f9b7b1999-01-30 22:39:17 +000012tag is one of 'keyword', 'string', 'comment' or 'identifier'
Tim Peters182b5ac2004-07-18 06:16:08 +000013sublist is not used, hence always None.
Just van Rossum40f9b7b1999-01-30 22:39:17 +000014"""
15
16# Based on FontText.py by Mitchell S. Chapman,
17# which was modified by Zachary Roadhouse,
18# then un-Tk'd by Just van Rossum.
19# Many thanks for regular expression debugging & authoring are due to:
#   Tim (the-incredib-ly y'rs) Peters and Christian Tismer
Just van Rossum40f9b7b1999-01-30 22:39:17 +000021# So, who owns the copyright? ;-) How about this:
Tim Peters182b5ac2004-07-18 06:16:08 +000022# Copyright 1996-2001:
23# Mitchell S. Chapman,
24# Zachary Roadhouse,
25# Tim Peters,
26# Just van Rossum
Just van Rossum40f9b7b1999-01-30 22:39:17 +000027
Just van Rossum3eec7622001-07-10 19:25:40 +000028__version__ = "0.4"
Just van Rossum40f9b7b1999-01-30 22:39:17 +000029
Just van Rossum3eec7622001-07-10 19:25:40 +000030import string
31import re
Just van Rossum40f9b7b1999-01-30 22:39:17 +000032
33# First a little helper, since I don't like to repeat things. (Tismer speaking)
34import string
def replace(where, what, with_):
    """Return a copy of `where` with every occurrence of `what` replaced.

    where -- the string to operate on.
    what  -- the (non-empty) substring to look for.
    with_ -- the replacement text.

    The third parameter was originally spelled ``with``, which is a
    reserved word (Python 2.6 and later) and therefore a syntax error.
    Callers pass it positionally, so the rename is backward-compatible.

    The split/join formulation is kept, but expressed with str methods
    instead of the ``string`` module functions that no longer exist in
    Python 3.  As before, an empty `what` raises ValueError.
    """
    return with_.join(where.split(what))
Just van Rossum40f9b7b1999-01-30 22:39:17 +000037
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise", "yield"]

# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
commentPat = r"#[^\n]*"

# Template for a single-line string literal.  Every "q" is a placeholder
# for the quote character and is substituted below.  A backslash escapes
# whatever single character follows it; [\s\S] means "any character at
# all, newline included", so the escape also works for characters above
# U+00FF (the former [\000-\377] class stopped at 255).
pat = r"q[^\\q\n]*(\\[\s\S][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Way to go, Tim!
# Template for a triple-quoted string: the body may contain escaped
# characters and runs of one or two quote characters, but never three
# unescaped quotes in a row.  Every branch that consumes an ordinary
# character excludes the backslash, so that an escape sequence can only
# ever be consumed as a unit.
pat = r"""
    qqq
    [^\\q]*
    (
        (   \\[\s\S]
        |   q
            (   \\[\s\S]
            |   [^\\q]
            |   q
                (   \\[\s\S]
                |   [^\\q]
                )
            )
        )
        [^\\q]*
    )*
    qqq
"""
pat = "".join(pat.split())      # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat identifies characters which may legally precede
# (or follow) a keyword pattern.
nonKeyPat = r"(^|[^a-zA-Z0-9_.\"'])"

# A keyword match therefore consists of three groups: the context
# character before the keyword (empty at the very start of the text),
# the keyword itself, and the context character after it.
keyPat = nonKeyPat + "(" + "|".join(keywordsList) + ")" + nonKeyPat

# commentPat has no groups, so keyPat's groups are numbers 1-3 in matchRE.
# The triple-quoted alternative must precede the single-quoted one,
# otherwise '''...''' would be read as an empty string ''.
matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"      # Ident w. leading whitespace.
idRE = re.compile(idKeyPat)
Just van Rossum40f9b7b1999-01-30 22:39:17 +000092
93
def fontify(pytext, searchfrom=0, searchto=None):
    """Scan Python source text and return a list of colouring tags.

    pytext     -- string containing Python source code.
    searchfrom -- index at which scanning starts (default 0).
    searchto   -- scanning stops at the first match that begins at or
                  after this index (default: len(pytext)).

    Returns a list of (tag, startindex, endindex, None) tuples in the
    order the items occur in the text.  tag is 'keyword', 'string',
    'comment' or 'identifier'.  An 'identifier' tag is only produced for
    the name following "def" or "class"; its span starts right after the
    keyword, so it includes the whitespace in front of the name.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    idMatch = idRE.match

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    pos = searchfrom
    while True:
        m = search(pytext, pos)
        if m is None:
            break   # EXIT LOOP
        start = m.start()
        if start >= searchto:
            break   # EXIT LOOP
        # Group 2 of matchRE is the keyword alternative; it is None when
        # the match is a comment or a string.
        keyword = m.group(2)
        if keyword is None:
            pos = m.end()
            if pytext[start] == "#":
                tags_append((commentTag, start, pos, None))
            else:
                tags_append((stringTag, start, pos, None))
            continue
        # A keyword match is wrapped in one context character on each side
        # (the leading one is absent only when '^' matched at index 0).
        # Take the span of the keyword group itself rather than guessing
        # from the position whether a leading character is present.
        start, pos = m.span(2)
        tags_append((keywordTag, start, pos, None))
        # Scanning resumes at the trailing context character, so that it
        # can serve as the leading context of an adjacent keyword.
        # If this was a defining keyword, look ahead to the
        # following identifier.
        if keyword in ("def", "class"):
            m = idMatch(pytext, pos)
            if m is not None:
                start = pos
                pos = m.end()
                tags_append((identifierTag, start, pos, None))
    return tags
Just van Rossum40f9b7b1999-01-30 22:39:17 +0000147
148
def test(path):
    """Fontify the file at `path` and print one line per tag found.

    Each output line consists of the tag name, a space, and the repr()
    of the tagged slice of the file's text.
    """
    # The with-statement closes the file even if read() raises.
    with open(path) as f:
        text = f.read()
    tags = fontify(text)
    for tag, start, end, sublist in tags:
        # Formatting into a single string keeps the output identical to
        # the old "print tag, repr(...)" statement while being valid
        # syntax on both Python 2 and Python 3.
        print("%s %s" % (tag, repr(text[start:end])))