blob: 77cb057ce21f58b10721cdeb59417c28edff08a4 [file] [log] [blame]
Terry Jan Reedy10b1c7c2014-06-16 19:01:01 -04001"""Provide advanced parsing abilities for ParenMatch and other extensions.
Terry Jan Reedy3e583302014-06-16 02:33:35 -04002
3HyperParser uses PyParser. PyParser mostly gives information on the
4proper indentation of code. HyperParser gives additional information on
5the structure of code.
Kurt B. Kaiserb1754452005-11-18 22:05:48 +00006"""
7
8import string
Tal Einat9b7f9e62014-07-16 16:33:36 +03009from keyword import iskeyword
Kurt B. Kaiser2d7f6a02007-08-22 23:01:33 +000010from idlelib import PyParse
Kurt B. Kaiserb1754452005-11-18 22:05:48 +000011
Kurt B. Kaiserb1754452005-11-18 22:05:48 +000012
# Every ASCII character that may appear somewhere inside an identifier.
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
# Every ASCII character that may begin an identifier (no digits).
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")

# Lookup tables indexed by code point (0..127): entry N is True when
# chr(N) is valid inside an identifier / as its first character.
_IS_ASCII_ID_CHAR = [
    chr(code) in _ASCII_ID_CHARS for code in range(128)
]
_IS_ASCII_ID_FIRST_CHAR = [
    chr(code) in _ASCII_ID_FIRST_CHARS for code in range(128)
]
24
25
class HyperParser:
    """Give structural information about the statement around an index.

    Instance attributes set by __init__ / set_index:

    editwin        -- the editor window object passed to __init__
    text           -- editwin.text, the widget queried for text and indices
    rawtext        -- the analyzed source text (parser.str without the
                      trailing ' \\n' that __init__ appends)
    stopatindex    -- text index ("<line>.end") where rawtext ends
    bracketing     -- result of parser.get_last_stmt_bracketing(); used
                      here as a sequence of (offset in rawtext, nesting
                      level) pairs
    isopener       -- list of bools parallel to bracketing; True where the
                      level is higher than in the previous entry
    indexinrawtext -- offset in rawtext of the current index
    indexbracket   -- position in bracketing that the current index
                      belongs to
    """

    def __init__(self, editwin, index):
        """To initialize, analyze the surroundings of the given index.

        editwin must provide text, indentwidth, tabwidth, context_use_ps1,
        num_context_lines and _build_char_in_string_func(); index is an
        index understood by editwin.text.
        """
        self.editwin = editwin
        self.text = text = editwin.text

        parser = PyParse.Parser(editwin.indentwidth, editwin.tabwidth)

        def index2line(index):
            return int(float(index))
        lno = index2line(text.index(index))

        if not editwin.context_use_ps1:
            # Try progressively larger amounts of preceding context until
            # the parser finds a good place to start, or until the context
            # reaches the first line.
            # NOTE: relies on num_context_lines being non-empty; otherwise
            # bod and stopatindex are never assigned.
            for context in editwin.num_context_lines:
                startat = max(lno - context, 1)
                startatindex = repr(startat) + ".0"
                stopatindex = "%d.end" % lno
                # We add the newline because PyParse requires a newline
                # at end. We add a space so that index won't be at end
                # of line, so that its status will be the same as the
                # char before it, if should.
                parser.set_str(text.get(startatindex, stopatindex)+' \n')
                bod = parser.find_good_parse_start(
                          editwin._build_char_in_string_func(startatindex))
                if bod is not None or startat == 1:
                    break
            parser.set_lo(bod or 0)
        else:
            # Start right after the previous range tagged "console", or
            # at the beginning of the text if there is none.
            r = text.tag_prevrange("console", index)
            if r:
                startatindex = r[1]
            else:
                startatindex = "1.0"
            stopatindex = "%d.end" % lno
            # We add the newline because PyParse requires it. We add a
            # space so that index won't be at end of line, so that its
            # status will be the same as the char before it, if should.
            parser.set_str(text.get(startatindex, stopatindex)+' \n')
            parser.set_lo(0)

        # We want what the parser has, minus the last newline and space.
        self.rawtext = parser.str[:-2]
        # Parser.str apparently preserves the statement we are in, so
        # that stopatindex can be used to synchronize the string with
        # the text box indices.
        self.stopatindex = stopatindex
        self.bracketing = parser.get_last_stmt_bracketing()
        # find which pairs of bracketing are openers. These always
        # correspond to a character of rawtext.
        self.isopener = [i>0 and self.bracketing[i][1] >
                         self.bracketing[i-1][1]
                         for i in range(len(self.bracketing))]

        self.set_index(index)

    def set_index(self, index):
        """Set the index to which the functions relate.

        The index must be in the same statement.

        Raises ValueError if index lies before the start of rawtext.
        """
        # rawtext ends at stopatindex, so the offset of index is the
        # length of rawtext minus the length of the text after index.
        indexinrawtext = (len(self.rawtext) -
                          len(self.text.get(index, self.stopatindex)))
        if indexinrawtext < 0:
            raise ValueError("Index %s precedes the analyzed statement"
                             % index)
        self.indexinrawtext = indexinrawtext
        # find the rightmost bracket to which index belongs
        self.indexbracket = 0
        while (self.indexbracket < len(self.bracketing)-1 and
               self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
            self.indexbracket += 1
        # An entry that starts exactly at the index is also taken when it
        # is not an opener.
        if (self.indexbracket < len(self.bracketing)-1 and
                self.bracketing[self.indexbracket+1][0] ==
                self.indexinrawtext and
                not self.isopener[self.indexbracket+1]):
            self.indexbracket += 1

    def is_in_string(self):
        """Is the index given to the HyperParser in a string?"""
        # The bracket to which we belong should be an opener.
        # If it's an opener, it has to have a character.
        return (self.isopener[self.indexbracket] and
                self.rawtext[self.bracketing[self.indexbracket][0]]
                in ('"', "'"))

    def is_in_code(self):
        """Is the index given to the HyperParser in normal code?"""
        # Not in code means: inside an opener whose first character
        # starts a comment or a string.
        return (not self.isopener[self.indexbracket] or
                self.rawtext[self.bracketing[self.indexbracket][0]]
                not in ('#', '"', "'"))

    def get_surrounding_brackets(self, openers='([{', mustclose=False):
        """Return bracket indexes or None.

        If the index given to the HyperParser is surrounded by a
        bracket defined in openers (or at least has one before it),
        return the indices of the opening bracket and the closing
        bracket (or the end of line, whichever comes first).

        If it is not surrounded by brackets, or the end of line comes
        before the closing bracket and mustclose is True, returns None.
        """

        bracketinglevel = self.bracketing[self.indexbracket][1]
        before = self.indexbracket
        # Walk backwards to the nearest enclosing opener whose character
        # is one of the requested openers.
        while (not self.isopener[before] or
               self.rawtext[self.bracketing[before][0]] not in openers or
               self.bracketing[before][1] > bracketinglevel):
            before -= 1
            if before < 0:
                return None
            bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
        # Walk forwards to the first entry below that nesting level.
        after = self.indexbracket + 1
        while (after < len(self.bracketing) and
               self.bracketing[after][1] >= bracketinglevel):
            after += 1

        # Offsets in rawtext are converted to text indices by counting
        # characters back from stopatindex.
        beforeindex = self.text.index("%s-%dc" %
            (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
        if (after >= len(self.bracketing) or
                self.bracketing[after][0] > len(self.rawtext)):
            if mustclose:
                return None
            afterindex = self.stopatindex
        else:
            # We are after a real char, so it is a ')' and we give the
            # index before it.
            afterindex = self.text.index(
                "%s-%dc" % (self.stopatindex,
                 len(self.rawtext)-(self.bracketing[after][0]-1)))

        return beforeindex, afterindex

    # the set of built-in identifiers which are also keywords,
    # i.e. keyword.iskeyword() returns True for them
    _ID_KEYWORDS = frozenset({"True", "False", "None"})

    @classmethod
    def _eat_identifier(cls, str, limit, pos):
        """Given a string and pos, return the number of chars in the
        identifier which ends at pos, or 0 if there is no such one.

        The identifier may not start before index limit.  Keywords are
        not considered identifiers, except for True, False and None.
        """
        is_ascii_id_char = _IS_ASCII_ID_CHAR

        # Start at the end (pos) and work backwards.
        i = pos

        # Go backwards as long as the characters are valid ASCII
        # identifier characters. This is an optimization, since it
        # is faster in the common case where most of the characters
        # are ASCII.
        while i > limit and (
                ord(str[i - 1]) < 128 and
                is_ascii_id_char[ord(str[i - 1])]
        ):
            i -= 1

        # If the above loop ended due to reaching a non-ASCII
        # character, continue going backwards using the most generic
        # test for whether a string contains only valid identifier
        # characters.
        if i > limit and ord(str[i - 1]) >= 128:
            # The 'a' prefix makes isidentifier() test the slice as
            # continuation characters only. Once the 4-step loop stops,
            # at most three more characters can belong to the identifier;
            # the 2-step and 1-step checks cover those.
            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
                i -= 4
            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
                i -= 2
            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
                i -= 1

            # The identifier candidate starts here. If it isn't a valid
            # identifier, don't eat anything. At this point that is only
            # possible if the first character isn't a valid first
            # character for an identifier.
            if not str[i:pos].isidentifier():
                return 0
        elif i < pos:
            # All characters in str[i:pos] are valid ASCII identifier
            # characters, so it is enough to check that the first is
            # valid as the first character of an identifier.
            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
                return 0

        # All keywords are valid identifiers, but should not be
        # considered identifiers here, except for True, False and None.
        if i < pos and (
                iskeyword(str[i:pos]) and
                str[i:pos] not in cls._ID_KEYWORDS
        ):
            return 0

        return pos - i

    # This string includes all chars that may be in a white space
    _whitespace_chars = " \t\n\\"

    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        Raises ValueError if the index is not in normal code (see
        is_in_code).
        """
        if not self.is_in_code():
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing

        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        last_identifier_pos = pos
        # True while an identifier (or closing bracket) is expected next;
        # False right after an identifier, when only a dot may continue
        # the expression.
        postdot_phase = True

        while True:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while True:
                if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars:
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression end at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                if rawtext[pos] in "([":
                    # [] and () may be used after an identifier, so we
                    # continue. postdot_phase is True, so we don't allow a dot.
                    pass
                else:
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix
                        while pos > 0 and rawtext[pos - 1] in "rRbBuU":
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]
Terry Jan Reedy10b1c7c2014-06-16 19:01:01 -0400309
310
311if __name__ == '__main__':
312 import unittest
313 unittest.main('idlelib.idle_test.test_hyperparser', verbosity=2)