blob: 43e10b3eea6964565e9be5ec0b1fde021f790ee0 [file] [log] [blame]
Guido van Rossum8113cdc1999-06-01 19:49:21 +00001import string
2import re
3import sys
4
# Codes describing why the last statement is continued onto the next
# line; C_NONE means it is not continued at all.
C_NONE, C_BACKSLASH, C_STRING, C_BRACKET = 0, 1, 2, 3
7
if 0:   # flip to 1 for throwaway debugging output
    def dump(*stuff):
        """Write the str() of each argument, space-separated and followed
        by a newline, to the interpreter's original stdout."""
        # " ".join replaces string.join, which no longer exists in Python 3.
        sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
11
# Search for a line that begins (after optional blanks/tabs) with one of
# a handful of common statement keywords; such a line is a plausible
# place to start parsing from.

_synchre = re.compile(r"""
    ^
    [ \t]*
    (?: if | else | elif | while | def | class )
    \b
""", re.MULTILINE | re.VERBOSE).search
20
# Match a line that is blank, or that holds only a comment whose "#" is
# immediately followed by a non-whitespace char (a non-indenting comment).

_junkre = re.compile(r"""
    [ \t]*
    (?: \# \S .* )?
    \n
""", re.VERBOSE).match
28
# Match a string literal of any flavor (triple- or single-quoted, with
# either quote char).  The closing quote is optional, so an unfinished
# string in incomplete program text still matches.

_match_stringre = re.compile(r"""
    \""" [^"\\]* (?:
                     (?: \\. | "(?!"") )
                     [^"\\]*
                 )*
    (?: \""" )?

|   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?

|   ''' [^'\\]* (?:
                   (?: \\. | '(?!'') )
                   [^'\\]*
                )*
    (?: ''' )?

|   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
""", re.DOTALL | re.VERBOSE).match
49
# Match optional blanks/tabs followed by a char that is not whitespace,
# "#" or a backslash; used to locate the first item inside a bracket
# structure.

_itemre = re.compile(r"""
    [ \t]*
    [^\s#\\]    # if we match, m.end()-1 is the interesting char
""", re.VERBOSE).match
57
# Match the beginning of a statement after which the following line
# should be dedented.

_closere = re.compile(r"""
    \s*
    (?: return
    |   break
    |   continue
    |   raise
    |   pass
    )
    \b
""", re.VERBOSE).match
70
# Consume a run of whitespace and "ordinary" chars (anything other than
# brackets, quotes, "#" and backslash) in one go.  Group 1 records the
# last chunk of ordinary chars: when the match succeeds and m.group(1)
# is not None, m.end(1) - 1 is the index of the last non-whitespace
# char consumed.

_chew_ordinaryre = re.compile(r"""
    (?: \s+
    |   ( [^\s[\](){}#'"\\]+ )
    )+
""", re.VERBOSE).match
81
# Build a 256-entry translation table that maps open brackets to "(",
# close brackets to ")", leaves quotes, backslash, newline and "#" as
# themselves, and maps every other char to "x".

_tran = ['x'] * 256
for ch in "({[":
    _tran[ord(ch)] = '('
for ch in ")}]":
    _tran[ord(ch)] = ')'
for ch in "\"'\\\n#":
    _tran[ord(ch)] = ch
# ''.join replaces string.join, which no longer exists in Python 3; the
# result is the same 256-character string.
_tran = ''.join(_tran)
del ch
94
class Parser:
    """Analyze the tail of a chunk of Python source text.

    Install the text with set_str(), optionally trim it with
    find_good_parse_start() + set_lo(), then query the continuation
    status, indentation and block structure of the last statement.
    Analysis is done lazily in two stages (_study1, _study2) and cached
    in self.study_level until the text changes.
    """

    # Index of the last open bracket ({[ in the last interesting stmt,
    # or None if there is none.  Refreshed by every _study2() run.
    lastopenbracketpos = None

    def __init__(self, indentwidth, tabwidth):
        """Store the indent width and tab width used by the indent
        computations."""
        self.indentwidth = indentwidth
        self.tabwidth = tabwidth

    def set_str(self, str):
        """Install the program text to analyze and drop cached analysis.

        The text must be empty or end with a newline.
        """
        assert len(str) == 0 or str[-1] == '\n'
        self.str = str
        self.study_level = 0

    def find_good_parse_start(self, use_ps1,
                              is_char_in_string=None,
                              _synchre=_synchre):
        """Return the index of a good place to begin parsing, or None.

        The result is as close to the end of the text as possible.

        use_ps1 -- true for the shell-window hack: the position just
            after the last "\\n" + prompt is returned, and the prompt's
            final char in self.str is replaced by a newline so that the
            result is acceptable to set_lo().
        is_char_in_string -- callable taking an index into the text; the
            start of a popular stmt (see _synchre) is accepted only
            where it returns false.  The result is reliable iff, when
            this function says "no", the char is guaranteed not to be in
            a string.  If omitted (and use_ps1 is false), None is
            returned because nothing can be trusted.
        _synchre -- search function used to find candidate stmt starts.

        The shell hack exists because in the shell window there is no
        way to tell output, prompts and user input apart; IDLE's first
        output line ("Python 1.5.2 (#0, Apr 13 1999, ...") would make the
        rest look like it's inside an unclosed paren.
        """
        code, pos = self.str, None
        if use_ps1:
            # sys.ps1 only exists in interactive mode; fall back to the
            # standard prompt rather than raising AttributeError.
            ps1 = '\n' + getattr(sys, 'ps1', '>>> ')
            i = code.rfind(ps1)
            if i >= 0:
                pos = i + len(ps1)
                self.str = code[:pos-1] + '\n' + code[pos:]
                # the text changed, so any cached analysis is stale
                self.study_level = 0
        elif is_char_in_string:
            # Keep the last candidate that is not inside a string.
            i = 0
            while 1:
                m = _synchre(code, i)
                if not m:
                    break
                s, i = m.span()
                if not is_char_in_string(s):
                    pos = s
        # else we can't be sure of anything, so pos stays None
        return pos

    def set_lo(self, lo):
        """Throw away the text before index lo.

        Intended to be called with find_good_parse_start's result; lo
        must be 0 or the index just after a newline.
        """
        assert lo == 0 or self.str[lo-1] == '\n'
        if lo > 0:
            self.str = self.str[lo:]
            # Offsets and line numbers cached by an earlier study refer
            # to the untrimmed text, so force a fresh study.
            self.study_level = 0

    def _study1(self, _replace=str.replace, _find=str.find):
        """Find the line numbers (0-based) of the non-continuation lines.

        Creates self.goodlines (those line numbers, with the final line
        number always pushed as a sentinel) and self.continuation (one of
        the C_* codes, telling whether and why the last stmt is
        continued).  Does nothing if this stage already ran.

        _replace and _find are the replace/find functions to use, called
        as _replace(s, old, new) and _find(s, sub, start).
        """
        if self.study_level >= 1:
            return
        self.study_level = 1

        # Map all uninteresting characters to "x", all open brackets
        # to "(", all close brackets to ")", then collapse runs of
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        code = self.str.translate(_tran)
        code = _replace(code, 'xxxxxxxx', 'x')
        code = _replace(code, 'xxxx', 'x')
        code = _replace(code, 'xx', 'x')
        code = _replace(code, 'xx', 'x')
        code = _replace(code, '\nx', '\n')
        # note that replacing x\n with \n would be incorrect, because
        # x may be preceded by a backslash

        # March over the squashed version of the program, accumulating
        # the line numbers of non-continued stmts, and determining
        # whether & why the last stmt is a continuation.
        continuation = C_NONE
        level = lno = 0     # level is nesting level; lno is line number
        self.goodlines = goodlines = [0]
        push_good = goodlines.append
        i, n = 0, len(code)
        while i < n:
            ch = code[i]
            i = i+1

            # cases are checked in decreasing order of frequency
            if ch == 'x':
                continue

            if ch == '\n':
                lno = lno + 1
                if level == 0:
                    push_good(lno)
                # else we're in an unclosed bracket structure
                continue

            if ch == '(':
                level = level + 1
                continue

            if ch == ')':
                if level:
                    level = level - 1
                # else the program is invalid, but we can't complain
                continue

            if ch == '"' or ch == "'":
                # consume the string
                quote = ch
                if code[i-1:i+2] == quote * 3:
                    quote = quote * 3
                w = len(quote) - 1      # extra quote chars beyond the first
                i = i+w
                while i < n:
                    ch = code[i]
                    i = i+1

                    if ch == 'x':
                        continue

                    if code[i-1:i+w] == quote:
                        i = i+w
                        break

                    if ch == '\n':
                        lno = lno + 1
                        if w == 0:
                            # unterminated single-quoted string
                            if level == 0:
                                push_good(lno)
                            break
                        continue

                    if ch == '\\':
                        assert i < n
                        if code[i] == '\n':
                            lno = lno + 1
                        i = i+1
                        continue

                    # else comment char or paren inside string

                else:
                    # didn't break out of the loop, so we're still
                    # inside a string
                    continuation = C_STRING
                continue    # with outer loop

            if ch == '#':
                # consume the comment; the newline itself is handled by
                # the next iteration
                i = _find(code, '\n', i)
                assert i >= 0
                continue

            if ch != '\\':
                # A char the translation table left alone (a code point
                # past the end of the table); it is as uninteresting as
                # an "x", so skip it instead of failing.
                continue
            assert i < n
            if code[i] == '\n':
                lno = lno + 1
                if i+1 == n:
                    continuation = C_BACKSLASH
            i = i+1

        # The last stmt may be continued for all 3 reasons.
        # String continuation takes precedence over bracket
        # continuation, which beats backslash continuation.
        if continuation != C_STRING and level > 0:
            continuation = C_BRACKET
        self.continuation = continuation

        # Push the final line number as a sentinel value, regardless of
        # whether it's continued.
        assert (continuation == C_NONE) == (goodlines[-1] == lno)
        if goodlines[-1] != lno:
            push_good(lno)

    def get_continuation_type(self):
        """Return the C_* code saying whether and why the last stmt is
        continued."""
        self._study1()
        return self.continuation

    def _study2(self, _rfind=str.rfind, _find=str.find,
                _ws=string.whitespace):
        """Examine every character of the last interesting stmt.

        _study1 is enough to determine the continuation status; this
        stage creates:
            self.stmt_start, self.stmt_end
                slice indices of the last interesting stmt
            self.lastch
                last non-whitespace character before an optional
                trailing comment ("" if there is none)
            self.lastopenbracketpos
                index of the last open bracket, or None if no bracket
                is left open
        Does nothing if this stage already ran.

        _rfind and _find are the search functions to use; _ws is unused
        and kept only so existing call signatures stay valid.
        """
        if self.study_level >= 2:
            return
        self._study1()
        self.study_level = 2

        # Set p and q to slice indices of last interesting stmt.
        code, goodlines = self.str, self.goodlines
        i = len(goodlines) - 1
        p = len(code)   # index of newest line
        while i:
            assert p
            # p is the index of the stmt at line number goodlines[i].
            # Move p back to the stmt at line number goodlines[i-1].
            q = p
            for nothing in range(goodlines[i-1], goodlines[i]):
                # tricky: sets p to 0 if no preceding newline
                p = _rfind(code, '\n', 0, p-1) + 1
            # The stmt code[p:q] isn't a continuation, but may be blank
            # or a non-indenting comment line.
            if _junkre(code, p):
                i = i-1
            else:
                break
        if i == 0:
            # nothing but junk!
            assert p == 0
            q = p
        self.stmt_start, self.stmt_end = p, q

        # Analyze this stmt, to find the last open bracket (if any)
        # and last interesting character (if any).
        lastch = ""
        stack = []  # stack of open bracket indices
        push_stack = stack.append
        while p < q:
            # suck up all except ()[]{}'"#\\
            m = _chew_ordinaryre(code, p, q)
            if m:
                # m.end(1) is -1 when only whitespace was consumed
                i = m.end(1) - 1    # last non-ws (if any)
                if i >= 0:
                    lastch = code[i]
                p = m.end()
                if p >= q:
                    break

            ch = code[p]

            if ch in "([{":
                push_stack(p)
                lastch = ch
                p = p+1
                continue

            if ch in ")]}":
                if stack:
                    del stack[-1]
                lastch = ch
                p = p+1
                continue

            if ch == '"' or ch == "'":
                # consume string
                # Note that study1 did this with a Python loop, but
                # we use a regexp here; the reason is speed in both
                # cases; the string may be huge, but study1 pre-squashed
                # strings to a couple of characters per line.  study1
                # also needed to keep track of newlines, and we don't
                # have to.
                lastch = ch
                p = _match_stringre(code, p, q).end()
                continue

            if ch == '#':
                # consume comment and trailing newline
                p = _find(code, '\n', p, q) + 1
                assert p > 0
                continue

            assert ch == '\\'
            p = p+1     # beyond backslash
            assert p < q
            if code[p] != '\n':
                # the program is invalid, but can't complain
                lastch = ch + code[p]
            p = p+1     # beyond escaped char

        # end while p < q:

        self.lastch = lastch
        # Always assign, so a position left over from text studied
        # earlier on this instance can never leak into this result.
        if stack:
            self.lastopenbracketpos = stack[-1]
        else:
            self.lastopenbracketpos = None

    def compute_bracket_indent(self, _find=str.find):
        """Assuming continuation is C_BRACKET, return the number of
        spaces the next line should be indented.

        If something interesting follows the last open bracket, the
        result is the column (tabs expanded) of that first item;
        otherwise it is the indentation of the bracket's line plus
        self.indentwidth.

        _find is the find function to use, called as _find(s, sub, start).
        """
        self._study2()
        assert self.continuation == C_BRACKET
        j = self.lastopenbracketpos
        code = self.str
        n = len(code)
        origi = i = code.rfind('\n', 0, j) + 1
        j = j+1     # one beyond open bracket
        # find first list item; set i to start of its line
        while j < n:
            m = _itemre(code, j)
            if m:
                j = m.end() - 1     # index of first interesting char
                extra = 0
                break
            else:
                # this line is junk; advance to next line
                i = j = _find(code, '\n', j) + 1
        else:
            # nothing interesting follows the bracket;
            # reproduce the bracket line's indentation + a level
            j = i = origi
            while code[j] in " \t":
                j = j+1
            extra = self.indentwidth
        return len(code[i:j].expandtabs(self.tabwidth)) + extra

    def get_num_lines_in_stmt(self):
        """Return the number of physical lines in the last stmt.

        This counts the last stmt whether or not it's an interesting
        one; it is intended to be called when continuation is
        C_BACKSLASH.
        """
        self._study1()
        goodlines = self.goodlines
        return goodlines[-1] - goodlines[-2]

    def compute_backslash_indent(self):
        """Assuming continuation is C_BACKSLASH, return the number of
        spaces the next line should be indented.

        Also assumes the new line is the first one following the initial
        line of the stmt.  If the initial line holds an assignment "="
        outside brackets that is followed by more than just the
        backslash, the result is one more than the width of the text up
        to and including that "=".  Otherwise it is one more than the
        width up to the end of the first chunk of non-whitespace chars.
        """
        self._study2()
        assert self.continuation == C_BACKSLASH
        code = self.str
        i = self.stmt_start
        while code[i] in " \t":
            i = i+1
        startpos = i

        # See whether the initial line starts an assignment stmt; i.e.,
        # look for an = operator
        endpos = code.find('\n', startpos) + 1
        found = level = 0
        while i < endpos:
            ch = code[i]
            if ch in "([{":
                level = level + 1
                i = i+1
            elif ch in ")]}":
                if level:
                    level = level - 1
                i = i+1
            elif ch == '"' or ch == "'":
                i = _match_stringre(code, i, endpos).end()
            elif ch == '#':
                break
            elif level == 0 and ch == '=' and \
                   (i == 0 or code[i-1] not in "=<>!") and \
                   code[i+1] != '=':
                # a lone "=", not part of ==, <=, >= or !=
                found = 1
                break
            else:
                i = i+1

        if found:
            # found a legit =, but it may be the last interesting
            # thing on the line
            i = i+1     # move beyond the =
            found = re.match(r"\s*\\", code[i:endpos]) is None

        if not found:
            # oh well ... settle for moving beyond the first chunk
            # of non-whitespace chars
            i = startpos
            while code[i] not in " \t\n":
                i = i+1

        return len(code[self.stmt_start:i].expandtabs(self.tabwidth)) + 1

    def get_base_indent_string(self):
        """Return the leading whitespace on the initial line of the last
        interesting stmt."""
        self._study2()
        i, n = self.stmt_start, self.stmt_end
        j = i
        code = self.str
        while j < n and code[j] in " \t":
            j = j + 1
        return code[i:j]

    def is_block_opener(self):
        """Did the last interesting stmt open a block?

        True when its last non-whitespace char (ignoring a trailing
        comment) is a colon.
        """
        self._study2()
        return self.lastch == ':'

    def is_block_closer(self):
        """Did the last interesting stmt close a block?

        True when it starts with one of the keywords matched by
        _closere.
        """
        self._study2()
        return _closere(self.str, self.stmt_start) is not None

    def get_last_open_bracket_pos(self):
        """Return the index of the last open bracket ({[ in the last
        interesting stmt, or None if there is none."""
        self._study2()
        return self.lastopenbracketpos