blob: 3e501ca1a191af4566e33fa119c998d23925dee9 [file] [log] [blame]
David Scherer7aced172000-08-15 01:13:23 +00001import re
2import sys
Tal Einat9b7f9e62014-07-16 16:33:36 +03003from collections import Mapping
4from functools import partial
David Scherer7aced172000-08-15 01:13:23 +00005
6# Reason last stmt is continued (or C_NONE if it's not).
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +00007(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
8 C_STRING_NEXT_LINES, C_BRACKET) = range(5)
David Scherer7aced172000-08-15 01:13:23 +00009
10if 0: # for throwaway debugging output
11 def dump(*stuff):
Kurt B. Kaiser254eb532002-09-17 03:55:13 +000012 sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
David Scherer7aced172000-08-15 01:13:23 +000013
14# Find what looks like the start of a popular stmt.
15
16_synchre = re.compile(r"""
17 ^
18 [ \t]*
Kurt B. Kaiserb1754452005-11-18 22:05:48 +000019 (?: while
David Scherer7aced172000-08-15 01:13:23 +000020 | else
21 | def
22 | return
23 | assert
24 | break
25 | class
26 | continue
27 | elif
28 | try
29 | except
30 | raise
31 | import
Kurt B. Kaiser752e4d52001-07-14 04:59:24 +000032 | yield
David Scherer7aced172000-08-15 01:13:23 +000033 )
34 \b
35""", re.VERBOSE | re.MULTILINE).search
36
37# Match blank line or non-indenting comment line.
38
39_junkre = re.compile(r"""
40 [ \t]*
41 (?: \# \S .* )?
42 \n
43""", re.VERBOSE).match
44
45# Match any flavor of string; the terminating quote is optional
46# so that we're robust in the face of incomplete program text.
47
48_match_stringre = re.compile(r"""
49 \""" [^"\\]* (?:
50 (?: \\. | "(?!"") )
51 [^"\\]*
52 )*
53 (?: \""" )?
54
55| " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
56
57| ''' [^'\\]* (?:
58 (?: \\. | '(?!'') )
59 [^'\\]*
60 )*
61 (?: ''' )?
62
63| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
64""", re.VERBOSE | re.DOTALL).match
65
66# Match a line that starts with something interesting;
67# used to find the first item of a bracket structure.
68
69_itemre = re.compile(r"""
70 [ \t]*
71 [^\s#\\] # if we match, m.end()-1 is the interesting char
72""", re.VERBOSE).match
73
74# Match start of stmts that should be followed by a dedent.
75
76_closere = re.compile(r"""
77 \s*
78 (?: return
79 | break
80 | continue
81 | raise
82 | pass
83 )
84 \b
85""", re.VERBOSE).match
86
87# Chew up non-special chars as quickly as possible. If match is
88# successful, m.end() less 1 is the index of the last boring char
89# matched. If match is unsuccessful, the string starts with an
90# interesting char.
91
92_chew_ordinaryre = re.compile(r"""
93 [^[\](){}#'"\\]+
94""", re.VERBOSE).match
95
David Scherer7aced172000-08-15 01:13:23 +000096
Tal Einat9b7f9e62014-07-16 16:33:36 +030097class StringTranslatePseudoMapping(Mapping):
98 r"""Utility class to be used with str.translate()
99
100 This Mapping class wraps a given dict. When a value for a key is
101 requested via __getitem__() or get(), the key is looked up in the
102 given dict. If found there, the value from the dict is returned.
103 Otherwise, the default value given upon initialization is returned.
104
105 This allows using str.translate() to make some replacements, and to
106 replace all characters for which no replacement was specified with
107 a given character instead of leaving them as-is.
108
109 For example, to replace everything except whitespace with 'x':
110
111 >>> whitespace_chars = ' \t\n\r'
112 >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars}
113 >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x'))
114 >>> text = "a + b\tc\nd"
115 >>> text.translate(mapping)
116 'x x x\tx\nx'
117 """
118 def __init__(self, non_defaults, default_value):
119 self._non_defaults = non_defaults
120 self._default_value = default_value
121
122 def _get(key, _get=non_defaults.get, _default=default_value):
123 return _get(key, _default)
124 self._get = _get
125
126 def __getitem__(self, item):
127 return self._get(item)
128
129 def __len__(self):
130 return len(self._non_defaults)
131
132 def __iter__(self):
133 return iter(self._non_defaults)
134
135 def get(self, key, default=None):
136 return self._get(key)
137
David Scherer7aced172000-08-15 01:13:23 +0000138
139class Parser:
140
141 def __init__(self, indentwidth, tabwidth):
142 self.indentwidth = indentwidth
143 self.tabwidth = tabwidth
144
Walter Dörwald5de48bd2007-06-11 21:38:39 +0000145 def set_str(self, s):
146 assert len(s) == 0 or s[-1] == '\n'
Walter Dörwald5de48bd2007-06-11 21:38:39 +0000147 self.str = s
David Scherer7aced172000-08-15 01:13:23 +0000148 self.study_level = 0
149
150 # Return index of a good place to begin parsing, as close to the
151 # end of the string as possible. This will be the start of some
152 # popular stmt like "if" or "def". Return None if none found:
153 # the caller should pass more prior context then, if possible, or
154 # if not (the entire program text up until the point of interest
155 # has already been tried) pass 0 to set_lo.
156 #
157 # This will be reliable iff given a reliable is_char_in_string
158 # function, meaning that when it says "no", it's absolutely
159 # guaranteed that the char is not in a string.
David Scherer7aced172000-08-15 01:13:23 +0000160
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000161 def find_good_parse_start(self, is_char_in_string=None,
David Scherer7aced172000-08-15 01:13:23 +0000162 _synchre=_synchre):
163 str, pos = self.str, None
David Scherer7aced172000-08-15 01:13:23 +0000164
David Scherer7aced172000-08-15 01:13:23 +0000165 if not is_char_in_string:
166 # no clue -- make the caller pass everything
167 return None
168
169 # Peek back from the end for a good place to start,
170 # but don't try too often; pos will be left None, or
171 # bumped to a legitimate synch point.
172 limit = len(str)
173 for tries in range(5):
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000174 i = str.rfind(":\n", 0, limit)
David Scherer7aced172000-08-15 01:13:23 +0000175 if i < 0:
176 break
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000177 i = str.rfind('\n', 0, i) + 1 # start of colon line
David Scherer7aced172000-08-15 01:13:23 +0000178 m = _synchre(str, i, limit)
179 if m and not is_char_in_string(m.start()):
180 pos = m.start()
181 break
182 limit = i
183 if pos is None:
184 # Nothing looks like a block-opener, or stuff does
185 # but is_char_in_string keeps returning true; most likely
186 # we're in or near a giant string, the colorizer hasn't
187 # caught up enough to be helpful, or there simply *aren't*
188 # any interesting stmts. In any of these cases we're
189 # going to have to parse the whole thing to be sure, so
190 # give it one last try from the start, but stop wasting
191 # time here regardless of the outcome.
192 m = _synchre(str)
193 if m and not is_char_in_string(m.start()):
194 pos = m.start()
195 return pos
196
197 # Peeking back worked; look forward until _synchre no longer
198 # matches.
199 i = pos + 1
200 while 1:
201 m = _synchre(str, i)
202 if m:
203 s, i = m.span()
204 if not is_char_in_string(s):
205 pos = s
206 else:
207 break
208 return pos
209
210 # Throw away the start of the string. Intended to be called with
211 # find_good_parse_start's result.
212
213 def set_lo(self, lo):
214 assert lo == 0 or self.str[lo-1] == '\n'
215 if lo > 0:
216 self.str = self.str[lo:]
217
Tal Einat9b7f9e62014-07-16 16:33:36 +0300218 # Build a translation table to map uninteresting chars to 'x', open
219 # brackets to '(', close brackets to ')' while preserving quotes,
220 # backslashes, newlines and hashes. This is to be passed to
221 # str.translate() in _study1().
222 _tran = {}
223 _tran.update((ord(c), ord('(')) for c in "({[")
224 _tran.update((ord(c), ord(')')) for c in ")}]")
225 _tran.update((ord(c), ord(c)) for c in "\"'\\\n#")
226 _tran = StringTranslatePseudoMapping(_tran, default_value=ord('x'))
227
David Scherer7aced172000-08-15 01:13:23 +0000228 # As quickly as humanly possible <wink>, find the line numbers (0-
229 # based) of the non-continuation lines.
230 # Creates self.{goodlines, continuation}.
231
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000232 def _study1(self):
David Scherer7aced172000-08-15 01:13:23 +0000233 if self.study_level >= 1:
234 return
235 self.study_level = 1
236
237 # Map all uninteresting characters to "x", all open brackets
238 # to "(", all close brackets to ")", then collapse runs of
239 # uninteresting characters. This can cut the number of chars
240 # by a factor of 10-40, and so greatly speed the following loop.
241 str = self.str
Tal Einat9b7f9e62014-07-16 16:33:36 +0300242 str = str.translate(self._tran)
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000243 str = str.replace('xxxxxxxx', 'x')
244 str = str.replace('xxxx', 'x')
245 str = str.replace('xx', 'x')
246 str = str.replace('xx', 'x')
247 str = str.replace('\nx', '\n')
David Scherer7aced172000-08-15 01:13:23 +0000248 # note that replacing x\n with \n would be incorrect, because
249 # x may be preceded by a backslash
250
251 # March over the squashed version of the program, accumulating
252 # the line numbers of non-continued stmts, and determining
253 # whether & why the last stmt is a continuation.
254 continuation = C_NONE
255 level = lno = 0 # level is nesting level; lno is line number
256 self.goodlines = goodlines = [0]
257 push_good = goodlines.append
258 i, n = 0, len(str)
259 while i < n:
260 ch = str[i]
261 i = i+1
262
263 # cases are checked in decreasing order of frequency
264 if ch == 'x':
265 continue
266
267 if ch == '\n':
268 lno = lno + 1
269 if level == 0:
270 push_good(lno)
271 # else we're in an unclosed bracket structure
272 continue
273
274 if ch == '(':
275 level = level + 1
276 continue
277
278 if ch == ')':
279 if level:
280 level = level - 1
281 # else the program is invalid, but we can't complain
282 continue
283
284 if ch == '"' or ch == "'":
285 # consume the string
286 quote = ch
287 if str[i-1:i+2] == quote * 3:
288 quote = quote * 3
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +0000289 firstlno = lno
David Scherer7aced172000-08-15 01:13:23 +0000290 w = len(quote) - 1
291 i = i+w
292 while i < n:
293 ch = str[i]
294 i = i+1
295
296 if ch == 'x':
297 continue
298
299 if str[i-1:i+w] == quote:
300 i = i+w
301 break
302
303 if ch == '\n':
304 lno = lno + 1
305 if w == 0:
306 # unterminated single-quoted string
307 if level == 0:
308 push_good(lno)
309 break
310 continue
311
312 if ch == '\\':
313 assert i < n
314 if str[i] == '\n':
315 lno = lno + 1
316 i = i+1
317 continue
318
319 # else comment char or paren inside string
320
321 else:
322 # didn't break out of the loop, so we're still
323 # inside a string
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +0000324 if (lno - 1) == firstlno:
325 # before the previous \n in str, we were in the first
326 # line of the string
327 continuation = C_STRING_FIRST_LINE
328 else:
329 continuation = C_STRING_NEXT_LINES
David Scherer7aced172000-08-15 01:13:23 +0000330 continue # with outer loop
331
332 if ch == '#':
333 # consume the comment
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000334 i = str.find('\n', i)
David Scherer7aced172000-08-15 01:13:23 +0000335 assert i >= 0
336 continue
337
338 assert ch == '\\'
339 assert i < n
340 if str[i] == '\n':
341 lno = lno + 1
342 if i+1 == n:
343 continuation = C_BACKSLASH
344 i = i+1
345
346 # The last stmt may be continued for all 3 reasons.
347 # String continuation takes precedence over bracket
348 # continuation, which beats backslash continuation.
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +0000349 if (continuation != C_STRING_FIRST_LINE
350 and continuation != C_STRING_NEXT_LINES and level > 0):
David Scherer7aced172000-08-15 01:13:23 +0000351 continuation = C_BRACKET
352 self.continuation = continuation
353
354 # Push the final line number as a sentinel value, regardless of
355 # whether it's continued.
356 assert (continuation == C_NONE) == (goodlines[-1] == lno)
357 if goodlines[-1] != lno:
358 push_good(lno)
359
360 def get_continuation_type(self):
361 self._study1()
362 return self.continuation
363
364 # study1 was sufficient to determine the continuation status,
365 # but doing more requires looking at every character. study2
366 # does this for the last interesting statement in the block.
367 # Creates:
368 # self.stmt_start, stmt_end
369 # slice indices of last interesting stmt
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000370 # self.stmt_bracketing
371 # the bracketing structure of the last interesting stmt;
372 # for example, for the statement "say(boo) or die", stmt_bracketing
373 # will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are
374 # treated as brackets, for the matter.
David Scherer7aced172000-08-15 01:13:23 +0000375 # self.lastch
376 # last non-whitespace character before optional trailing
377 # comment
378 # self.lastopenbracketpos
379 # if continuation is C_BRACKET, index of last open bracket
380
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000381 def _study2(self):
David Scherer7aced172000-08-15 01:13:23 +0000382 if self.study_level >= 2:
383 return
384 self._study1()
385 self.study_level = 2
386
387 # Set p and q to slice indices of last interesting stmt.
388 str, goodlines = self.str, self.goodlines
389 i = len(goodlines) - 1
390 p = len(str) # index of newest line
391 while i:
392 assert p
393 # p is the index of the stmt at line number goodlines[i].
394 # Move p back to the stmt at line number goodlines[i-1].
395 q = p
396 for nothing in range(goodlines[i-1], goodlines[i]):
397 # tricky: sets p to 0 if no preceding newline
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000398 p = str.rfind('\n', 0, p-1) + 1
David Scherer7aced172000-08-15 01:13:23 +0000399 # The stmt str[p:q] isn't a continuation, but may be blank
400 # or a non-indenting comment line.
401 if _junkre(str, p):
402 i = i-1
403 else:
404 break
405 if i == 0:
406 # nothing but junk!
407 assert p == 0
408 q = p
409 self.stmt_start, self.stmt_end = p, q
410
411 # Analyze this stmt, to find the last open bracket (if any)
412 # and last interesting character (if any).
413 lastch = ""
414 stack = [] # stack of open bracket indices
415 push_stack = stack.append
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000416 bracketing = [(p, 0)]
David Scherer7aced172000-08-15 01:13:23 +0000417 while p < q:
418 # suck up all except ()[]{}'"#\\
419 m = _chew_ordinaryre(str, p, q)
420 if m:
421 # we skipped at least one boring char
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000422 newp = m.end()
David Scherer7aced172000-08-15 01:13:23 +0000423 # back up over totally boring whitespace
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000424 i = newp - 1 # index of last boring char
425 while i >= p and str[i] in " \t\n":
David Scherer7aced172000-08-15 01:13:23 +0000426 i = i-1
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000427 if i >= p:
David Scherer7aced172000-08-15 01:13:23 +0000428 lastch = str[i]
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000429 p = newp
David Scherer7aced172000-08-15 01:13:23 +0000430 if p >= q:
431 break
432
433 ch = str[p]
434
435 if ch in "([{":
436 push_stack(p)
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000437 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000438 lastch = ch
439 p = p+1
440 continue
441
442 if ch in ")]}":
443 if stack:
444 del stack[-1]
445 lastch = ch
446 p = p+1
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000447 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000448 continue
449
450 if ch == '"' or ch == "'":
451 # consume string
452 # Note that study1 did this with a Python loop, but
453 # we use a regexp here; the reason is speed in both
454 # cases; the string may be huge, but study1 pre-squashed
455 # strings to a couple of characters per line. study1
456 # also needed to keep track of newlines, and we don't
457 # have to.
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000458 bracketing.append((p, len(stack)+1))
David Scherer7aced172000-08-15 01:13:23 +0000459 lastch = ch
460 p = _match_stringre(str, p, q).end()
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000461 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000462 continue
463
464 if ch == '#':
465 # consume comment and trailing newline
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000466 bracketing.append((p, len(stack)+1))
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000467 p = str.find('\n', p, q) + 1
David Scherer7aced172000-08-15 01:13:23 +0000468 assert p > 0
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000469 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000470 continue
471
472 assert ch == '\\'
473 p = p+1 # beyond backslash
474 assert p < q
475 if str[p] != '\n':
476 # the program is invalid, but can't complain
477 lastch = ch + str[p]
478 p = p+1 # beyond escaped char
479
480 # end while p < q:
481
482 self.lastch = lastch
483 if stack:
484 self.lastopenbracketpos = stack[-1]
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000485 self.stmt_bracketing = tuple(bracketing)
David Scherer7aced172000-08-15 01:13:23 +0000486
487 # Assuming continuation is C_BRACKET, return the number
488 # of spaces the next line should be indented.
489
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000490 def compute_bracket_indent(self):
David Scherer7aced172000-08-15 01:13:23 +0000491 self._study2()
492 assert self.continuation == C_BRACKET
493 j = self.lastopenbracketpos
494 str = self.str
495 n = len(str)
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000496 origi = i = str.rfind('\n', 0, j) + 1
David Scherer7aced172000-08-15 01:13:23 +0000497 j = j+1 # one beyond open bracket
498 # find first list item; set i to start of its line
499 while j < n:
500 m = _itemre(str, j)
501 if m:
502 j = m.end() - 1 # index of first interesting char
503 extra = 0
504 break
505 else:
506 # this line is junk; advance to next line
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000507 i = j = str.find('\n', j) + 1
David Scherer7aced172000-08-15 01:13:23 +0000508 else:
509 # nothing interesting follows the bracket;
510 # reproduce the bracket line's indentation + a level
511 j = i = origi
512 while str[j] in " \t":
513 j = j+1
514 extra = self.indentwidth
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000515 return len(str[i:j].expandtabs(self.tabwidth)) + extra
David Scherer7aced172000-08-15 01:13:23 +0000516
517 # Return number of physical lines in last stmt (whether or not
518 # it's an interesting stmt! this is intended to be called when
519 # continuation is C_BACKSLASH).
520
521 def get_num_lines_in_stmt(self):
522 self._study1()
523 goodlines = self.goodlines
524 return goodlines[-1] - goodlines[-2]
525
526 # Assuming continuation is C_BACKSLASH, return the number of spaces
527 # the next line should be indented. Also assuming the new line is
528 # the first one following the initial line of the stmt.
529
530 def compute_backslash_indent(self):
531 self._study2()
532 assert self.continuation == C_BACKSLASH
533 str = self.str
534 i = self.stmt_start
535 while str[i] in " \t":
536 i = i+1
537 startpos = i
538
539 # See whether the initial line starts an assignment stmt; i.e.,
540 # look for an = operator
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000541 endpos = str.find('\n', startpos) + 1
David Scherer7aced172000-08-15 01:13:23 +0000542 found = level = 0
543 while i < endpos:
544 ch = str[i]
545 if ch in "([{":
546 level = level + 1
547 i = i+1
548 elif ch in ")]}":
549 if level:
550 level = level - 1
551 i = i+1
552 elif ch == '"' or ch == "'":
553 i = _match_stringre(str, i, endpos).end()
554 elif ch == '#':
555 break
556 elif level == 0 and ch == '=' and \
557 (i == 0 or str[i-1] not in "=<>!") and \
558 str[i+1] != '=':
559 found = 1
560 break
561 else:
562 i = i+1
563
564 if found:
565 # found a legit =, but it may be the last interesting
566 # thing on the line
567 i = i+1 # move beyond the =
568 found = re.match(r"\s*\\", str[i:endpos]) is None
569
570 if not found:
571 # oh well ... settle for moving beyond the first chunk
572 # of non-whitespace chars
573 i = startpos
574 while str[i] not in " \t\n":
575 i = i+1
576
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000577 return len(str[self.stmt_start:i].expandtabs(\
David Scherer7aced172000-08-15 01:13:23 +0000578 self.tabwidth)) + 1
579
580 # Return the leading whitespace on the initial line of the last
581 # interesting stmt.
582
583 def get_base_indent_string(self):
584 self._study2()
585 i, n = self.stmt_start, self.stmt_end
586 j = i
587 str = self.str
588 while j < n and str[j] in " \t":
589 j = j + 1
590 return str[i:j]
591
592 # Did the last interesting stmt open a block?
593
594 def is_block_opener(self):
595 self._study2()
596 return self.lastch == ':'
597
598 # Did the last interesting stmt close a block?
599
600 def is_block_closer(self):
601 self._study2()
602 return _closere(self.str, self.stmt_start) is not None
603
604 # index of last open bracket ({[, or None if none
605 lastopenbracketpos = None
606
607 def get_last_open_bracket_pos(self):
608 self._study2()
609 return self.lastopenbracketpos
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000610
611 # the structure of the bracketing of the last interesting statement,
612 # in the format defined in _study2, or None if the text didn't contain
613 # anything
614 stmt_bracketing = None
615
616 def get_last_stmt_bracketing(self):
617 self._study2()
618 return self.stmt_bracketing