blob: 450a709c09bbfafe02ac3631c469451888af43c8 [file] [log] [blame]
"""Provide advanced parsing abilities for ParenMatch and other extensions.

HyperParser uses PyParser.  PyParser mostly gives information on the
proper indentation of code.  HyperParser gives additional information on
the structure of code.
"""
Tal Einat9b7f9e62014-07-16 16:33:36 +03007from keyword import iskeyword
Terry Jan Reedybfbaa6b2016-08-31 00:50:55 -04008import string
Kurt B. Kaiserb1754452005-11-18 22:05:48 +00009
Terry Jan Reedybfbaa6b2016-08-31 00:50:55 -040010from idlelib import pyparse
Kurt B. Kaiserb1754452005-11-18 22:05:48 +000011
# All ASCII chars that may appear anywhere in an identifier.
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
# All ASCII chars that may be the first char of an identifier
# (same as above, minus the digits).
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")

# Lookup table, indexed by code point (0..127): is this 7-bit ASCII char
# valid inside a Python identifier?
_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
# Lookup table, indexed by code point (0..127): is this 7-bit ASCII char
# valid as the first char of a Python identifier?
_IS_ASCII_ID_FIRST_CHAR = [
    (chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)
]
23
24
class HyperParser:
    """Answer structural questions about the statement around a text index.

    On construction the text preceding the index is fed to a
    pyparse.Parser; the resulting raw text and bracketing of the last
    statement are stored and then queried by the other methods.
    """

    def __init__(self, editwin, index):
        """To initialize, analyze the surroundings of the given index.

        editwin must provide the attributes read below: text,
        indentwidth, tabwidth, context_use_ps1, num_context_lines and
        _build_char_in_string_func.  index is an index into editwin.text.
        """
        self.editwin = editwin
        self.text = text = editwin.text

        parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)

        def index2line(index):
            # A resolved index looks like "line.column"; keep the line.
            return int(float(index))
        lno = index2line(text.index(index))

        if not editwin.context_use_ps1:
            # Try progressively larger amounts of preceding context until
            # the parser finds a good place to start, or until we have
            # already reached the first line.
            for context in editwin.num_context_lines:
                startat = max(lno - context, 1)
                startatindex = repr(startat) + ".0"
                stopatindex = "%d.end" % lno
                # We add the newline because PyParse requires a newline
                # at end. We add a space so that index won't be at end
                # of line, so that its status will be the same as the
                # char before it, if should.
                parser.set_str(text.get(startatindex, stopatindex)+' \n')
                bod = parser.find_good_parse_start(
                          editwin._build_char_in_string_func(startatindex))
                if bod is not None or startat == 1:
                    break
            parser.set_lo(bod or 0)
        else:
            # Start right after the previous "console"-tagged range, or
            # at the very beginning if there is none.
            r = text.tag_prevrange("console", index)
            if r:
                startatindex = r[1]
            else:
                startatindex = "1.0"
            stopatindex = "%d.end" % lno
            # We add the newline because PyParse requires it. We add a
            # space so that index won't be at end of line, so that its
            # status will be the same as the char before it, if should.
            parser.set_str(text.get(startatindex, stopatindex)+' \n')
            parser.set_lo(0)

        # We want what the parser has, minus the last newline and space.
        self.rawtext = parser.str[:-2]
        # Parser.str apparently preserves the statement we are in, so
        # that stopatindex can be used to synchronize the string with
        # the text box indices.
        self.stopatindex = stopatindex
        self.bracketing = parser.get_last_stmt_bracketing()
        # find which pairs of bracketing are openers. These always
        # correspond to a character of rawtext.
        self.isopener = [i>0 and self.bracketing[i][1] >
                         self.bracketing[i-1][1]
                         for i in range(len(self.bracketing))]

        self.set_index(index)

    def set_index(self, index):
        """Set the index to which the functions relate.

        The index must be in the same statement.

        Raises ValueError if the index lies before the analyzed text.
        """
        # Both rawtext and the widget text end at stopatindex, so the
        # offset into rawtext is found by measuring back from the end.
        indexinrawtext = (len(self.rawtext) -
                          len(self.text.get(index, self.stopatindex)))
        if indexinrawtext < 0:
            raise ValueError("Index %s precedes the analyzed statement"
                             % index)
        self.indexinrawtext = indexinrawtext
        # find the rightmost bracket to which index belongs
        self.indexbracket = 0
        while (self.indexbracket < len(self.bracketing)-1 and
               self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
            self.indexbracket += 1
        # An entry located exactly at the index is also taken, but only
        # when it is not an opener.
        if (self.indexbracket < len(self.bracketing)-1 and
                self.bracketing[self.indexbracket+1][0] == self.indexinrawtext
                and not self.isopener[self.indexbracket+1]):
            self.indexbracket += 1

    def is_in_string(self):
        """Is the index given to the HyperParser in a string?"""
        # The bracket to which we belong should be an opener.
        # If it's an opener, it has to have a character.
        return (self.isopener[self.indexbracket] and
                self.rawtext[self.bracketing[self.indexbracket][0]]
                in ('"', "'"))

    def is_in_code(self):
        """Is the index given to the HyperParser in normal code?"""
        # Normal code means: not inside a region opened by a comment
        # marker or by a quote character.
        return (not self.isopener[self.indexbracket] or
                self.rawtext[self.bracketing[self.indexbracket][0]]
                not in ('#', '"', "'"))

    def get_surrounding_brackets(self, openers='([{', mustclose=False):
        """Return bracket indexes or None.

        If the index given to the HyperParser is surrounded by a
        bracket defined in openers (or at least has one before it),
        return the indices of the opening bracket and the closing
        bracket (or the end of line, whichever comes first).

        If it is not surrounded by brackets, or the end of line comes
        before the closing bracket and mustclose is True, returns None.
        """

        bracketinglevel = self.bracketing[self.indexbracket][1]
        before = self.indexbracket
        # Walk backwards to the nearest enclosing opener that is one of
        # the requested characters.
        while (not self.isopener[before] or
               self.rawtext[self.bracketing[before][0]] not in openers or
               self.bracketing[before][1] > bracketinglevel):
            before -= 1
            if before < 0:
                return None
            bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
        # Walk forwards until the nesting level drops below that opener.
        after = self.indexbracket + 1
        while (after < len(self.bracketing) and
               self.bracketing[after][1] >= bracketinglevel):
            after += 1

        # Text widget positions are expressed as a character count back
        # from stopatindex, which corresponds to the end of rawtext.
        beforeindex = self.text.index("%s-%dc" %
            (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
        if (after >= len(self.bracketing) or
                self.bracketing[after][0] > len(self.rawtext)):
            if mustclose:
                return None
            afterindex = self.stopatindex
        else:
            # We are after a real char, so it is a ')' and we give the
            # index before it.
            afterindex = self.text.index(
                "%s-%dc" % (self.stopatindex,
                 len(self.rawtext)-(self.bracketing[after][0]-1)))

        return beforeindex, afterindex

    # the set of built-in identifiers which are also keywords,
    # i.e. keyword.iskeyword() returns True for them
    _ID_KEYWORDS = frozenset({"True", "False", "None"})

    @classmethod
    def _eat_identifier(cls, str, limit, pos):
        """Given a string and pos, return the number of chars in the
        identifier which ends at pos, or 0 if there is no such one.

        The search never goes back past limit.  Keywords are not
        considered identifiers, except for True, False and None.
        """
        is_ascii_id_char = _IS_ASCII_ID_CHAR

        # Start at the end (pos) and work backwards.
        i = pos

        # Go backwards as long as the characters are valid ASCII
        # identifier characters. This is an optimization, since it
        # is faster in the common case where most of the characters
        # are ASCII.
        while i > limit and (
                ord(str[i - 1]) < 128 and
                is_ascii_id_char[ord(str[i - 1])]
        ):
            i -= 1

        # If the above loop ended due to reaching a non-ASCII
        # character, continue going backwards using the most generic
        # test for whether a string contains only valid identifier
        # characters.
        if i > limit and ord(str[i - 1]) >= 128:
            # The 'a' prefix makes the test accept a candidate whose own
            # first char is only valid as a non-first identifier char.
            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
                i -= 4
            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
                i -= 2
            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
                i -= 1

            # The identifier candidate starts here. If it isn't a valid
            # identifier, don't eat anything. At this point that is only
            # possible if the first character isn't a valid first
            # character for an identifier.
            if not str[i:pos].isidentifier():
                return 0
        elif i < pos:
            # All characters in str[i:pos] are valid ASCII identifier
            # characters, so it is enough to check that the first is
            # valid as the first character of an identifier.
            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
                return 0

        # All keywords are valid identifiers, but should not be
        # considered identifiers here, except for True, False and None.
        if i < pos and (
                iskeyword(str[i:pos]) and
                str[i:pos] not in cls._ID_KEYWORDS
        ):
            return 0

        return pos - i

    # This string includes all chars that may be in a white space
    _whitespace_chars = " \t\n\\"

    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        Raises ValueError if the index is not in normal code (see
        is_in_code).
        """
        if not self.is_in_code():
            # The two literals are concatenated, so the first one needs
            # its trailing space.
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing

        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        last_identifier_pos = pos
        # True while an identifier (or closing bracket) is expected next;
        # False while a dot is required in order to continue.
        postdot_phase = True

        while True:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while True:
                if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars:
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression end at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                if rawtext[pos] in "([":
                    # [] and () may be used after an identifier, so we
                    # continue. postdot_phase is True, so we don't allow a dot.
                    pass
                else:
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix
                        while pos > 0 and rawtext[pos - 1] in "rRbBuU":
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]
Terry Jan Reedy10b1c7c2014-06-16 19:01:01 -0400308
309
# When run as a script, run this module's unit tests verbosely.
if __name__ == '__main__':
    import unittest
    unittest.main('idlelib.idle_test.test_hyperparser', verbosity=2)