blob: 9ccbb250761e362e038f30a2a1f15e40d1cd3d1b [file] [log] [blame]
David Scherer7aced172000-08-15 01:13:23 +00001import re
2import sys
Tal Einat9b7f9e62014-07-16 16:33:36 +03003from collections import Mapping
David Scherer7aced172000-08-15 01:13:23 +00004
5# Reason last stmt is continued (or C_NONE if it's not).
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +00006(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
7 C_STRING_NEXT_LINES, C_BRACKET) = range(5)
David Scherer7aced172000-08-15 01:13:23 +00008
9if 0: # for throwaway debugging output
10 def dump(*stuff):
Kurt B. Kaiser254eb532002-09-17 03:55:13 +000011 sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
David Scherer7aced172000-08-15 01:13:23 +000012
13# Find what looks like the start of a popular stmt.
14
15_synchre = re.compile(r"""
16 ^
17 [ \t]*
Kurt B. Kaiserb1754452005-11-18 22:05:48 +000018 (?: while
David Scherer7aced172000-08-15 01:13:23 +000019 | else
20 | def
21 | return
22 | assert
23 | break
24 | class
25 | continue
26 | elif
27 | try
28 | except
29 | raise
30 | import
Kurt B. Kaiser752e4d52001-07-14 04:59:24 +000031 | yield
David Scherer7aced172000-08-15 01:13:23 +000032 )
33 \b
34""", re.VERBOSE | re.MULTILINE).search
35
36# Match blank line or non-indenting comment line.
37
38_junkre = re.compile(r"""
39 [ \t]*
40 (?: \# \S .* )?
41 \n
42""", re.VERBOSE).match
43
44# Match any flavor of string; the terminating quote is optional
45# so that we're robust in the face of incomplete program text.
46
47_match_stringre = re.compile(r"""
48 \""" [^"\\]* (?:
49 (?: \\. | "(?!"") )
50 [^"\\]*
51 )*
52 (?: \""" )?
53
54| " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
55
56| ''' [^'\\]* (?:
57 (?: \\. | '(?!'') )
58 [^'\\]*
59 )*
60 (?: ''' )?
61
62| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
63""", re.VERBOSE | re.DOTALL).match
64
65# Match a line that starts with something interesting;
66# used to find the first item of a bracket structure.
67
68_itemre = re.compile(r"""
69 [ \t]*
70 [^\s#\\] # if we match, m.end()-1 is the interesting char
71""", re.VERBOSE).match
72
73# Match start of stmts that should be followed by a dedent.
74
75_closere = re.compile(r"""
76 \s*
77 (?: return
78 | break
79 | continue
80 | raise
81 | pass
82 )
83 \b
84""", re.VERBOSE).match
85
86# Chew up non-special chars as quickly as possible. If match is
87# successful, m.end() less 1 is the index of the last boring char
88# matched. If match is unsuccessful, the string starts with an
89# interesting char.
90
91_chew_ordinaryre = re.compile(r"""
92 [^[\](){}#'"\\]+
93""", re.VERBOSE).match
94
David Scherer7aced172000-08-15 01:13:23 +000095
Tal Einat9b7f9e62014-07-16 16:33:36 +030096class StringTranslatePseudoMapping(Mapping):
97 r"""Utility class to be used with str.translate()
98
99 This Mapping class wraps a given dict. When a value for a key is
100 requested via __getitem__() or get(), the key is looked up in the
101 given dict. If found there, the value from the dict is returned.
102 Otherwise, the default value given upon initialization is returned.
103
104 This allows using str.translate() to make some replacements, and to
105 replace all characters for which no replacement was specified with
106 a given character instead of leaving them as-is.
107
108 For example, to replace everything except whitespace with 'x':
109
110 >>> whitespace_chars = ' \t\n\r'
111 >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars}
112 >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x'))
113 >>> text = "a + b\tc\nd"
114 >>> text.translate(mapping)
115 'x x x\tx\nx'
116 """
117 def __init__(self, non_defaults, default_value):
118 self._non_defaults = non_defaults
119 self._default_value = default_value
120
121 def _get(key, _get=non_defaults.get, _default=default_value):
122 return _get(key, _default)
123 self._get = _get
124
125 def __getitem__(self, item):
126 return self._get(item)
127
128 def __len__(self):
129 return len(self._non_defaults)
130
131 def __iter__(self):
132 return iter(self._non_defaults)
133
134 def get(self, key, default=None):
135 return self._get(key)
136
David Scherer7aced172000-08-15 01:13:23 +0000137
138class Parser:
139
140 def __init__(self, indentwidth, tabwidth):
141 self.indentwidth = indentwidth
142 self.tabwidth = tabwidth
143
Walter Dörwald5de48bd2007-06-11 21:38:39 +0000144 def set_str(self, s):
145 assert len(s) == 0 or s[-1] == '\n'
Walter Dörwald5de48bd2007-06-11 21:38:39 +0000146 self.str = s
David Scherer7aced172000-08-15 01:13:23 +0000147 self.study_level = 0
148
149 # Return index of a good place to begin parsing, as close to the
150 # end of the string as possible. This will be the start of some
151 # popular stmt like "if" or "def". Return None if none found:
152 # the caller should pass more prior context then, if possible, or
153 # if not (the entire program text up until the point of interest
154 # has already been tried) pass 0 to set_lo.
155 #
156 # This will be reliable iff given a reliable is_char_in_string
157 # function, meaning that when it says "no", it's absolutely
158 # guaranteed that the char is not in a string.
David Scherer7aced172000-08-15 01:13:23 +0000159
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000160 def find_good_parse_start(self, is_char_in_string=None,
David Scherer7aced172000-08-15 01:13:23 +0000161 _synchre=_synchre):
162 str, pos = self.str, None
David Scherer7aced172000-08-15 01:13:23 +0000163
David Scherer7aced172000-08-15 01:13:23 +0000164 if not is_char_in_string:
165 # no clue -- make the caller pass everything
166 return None
167
168 # Peek back from the end for a good place to start,
169 # but don't try too often; pos will be left None, or
170 # bumped to a legitimate synch point.
171 limit = len(str)
172 for tries in range(5):
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000173 i = str.rfind(":\n", 0, limit)
David Scherer7aced172000-08-15 01:13:23 +0000174 if i < 0:
175 break
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000176 i = str.rfind('\n', 0, i) + 1 # start of colon line
David Scherer7aced172000-08-15 01:13:23 +0000177 m = _synchre(str, i, limit)
178 if m and not is_char_in_string(m.start()):
179 pos = m.start()
180 break
181 limit = i
182 if pos is None:
183 # Nothing looks like a block-opener, or stuff does
184 # but is_char_in_string keeps returning true; most likely
185 # we're in or near a giant string, the colorizer hasn't
186 # caught up enough to be helpful, or there simply *aren't*
187 # any interesting stmts. In any of these cases we're
188 # going to have to parse the whole thing to be sure, so
189 # give it one last try from the start, but stop wasting
190 # time here regardless of the outcome.
191 m = _synchre(str)
192 if m and not is_char_in_string(m.start()):
193 pos = m.start()
194 return pos
195
196 # Peeking back worked; look forward until _synchre no longer
197 # matches.
198 i = pos + 1
199 while 1:
200 m = _synchre(str, i)
201 if m:
202 s, i = m.span()
203 if not is_char_in_string(s):
204 pos = s
205 else:
206 break
207 return pos
208
209 # Throw away the start of the string. Intended to be called with
210 # find_good_parse_start's result.
211
212 def set_lo(self, lo):
213 assert lo == 0 or self.str[lo-1] == '\n'
214 if lo > 0:
215 self.str = self.str[lo:]
216
Tal Einat9b7f9e62014-07-16 16:33:36 +0300217 # Build a translation table to map uninteresting chars to 'x', open
218 # brackets to '(', close brackets to ')' while preserving quotes,
219 # backslashes, newlines and hashes. This is to be passed to
220 # str.translate() in _study1().
221 _tran = {}
222 _tran.update((ord(c), ord('(')) for c in "({[")
223 _tran.update((ord(c), ord(')')) for c in ")}]")
224 _tran.update((ord(c), ord(c)) for c in "\"'\\\n#")
225 _tran = StringTranslatePseudoMapping(_tran, default_value=ord('x'))
226
David Scherer7aced172000-08-15 01:13:23 +0000227 # As quickly as humanly possible <wink>, find the line numbers (0-
228 # based) of the non-continuation lines.
229 # Creates self.{goodlines, continuation}.
230
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000231 def _study1(self):
David Scherer7aced172000-08-15 01:13:23 +0000232 if self.study_level >= 1:
233 return
234 self.study_level = 1
235
236 # Map all uninteresting characters to "x", all open brackets
237 # to "(", all close brackets to ")", then collapse runs of
238 # uninteresting characters. This can cut the number of chars
239 # by a factor of 10-40, and so greatly speed the following loop.
240 str = self.str
Tal Einat9b7f9e62014-07-16 16:33:36 +0300241 str = str.translate(self._tran)
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000242 str = str.replace('xxxxxxxx', 'x')
243 str = str.replace('xxxx', 'x')
244 str = str.replace('xx', 'x')
245 str = str.replace('xx', 'x')
246 str = str.replace('\nx', '\n')
David Scherer7aced172000-08-15 01:13:23 +0000247 # note that replacing x\n with \n would be incorrect, because
248 # x may be preceded by a backslash
249
250 # March over the squashed version of the program, accumulating
251 # the line numbers of non-continued stmts, and determining
252 # whether & why the last stmt is a continuation.
253 continuation = C_NONE
254 level = lno = 0 # level is nesting level; lno is line number
255 self.goodlines = goodlines = [0]
256 push_good = goodlines.append
257 i, n = 0, len(str)
258 while i < n:
259 ch = str[i]
260 i = i+1
261
262 # cases are checked in decreasing order of frequency
263 if ch == 'x':
264 continue
265
266 if ch == '\n':
267 lno = lno + 1
268 if level == 0:
269 push_good(lno)
270 # else we're in an unclosed bracket structure
271 continue
272
273 if ch == '(':
274 level = level + 1
275 continue
276
277 if ch == ')':
278 if level:
279 level = level - 1
280 # else the program is invalid, but we can't complain
281 continue
282
283 if ch == '"' or ch == "'":
284 # consume the string
285 quote = ch
286 if str[i-1:i+2] == quote * 3:
287 quote = quote * 3
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +0000288 firstlno = lno
David Scherer7aced172000-08-15 01:13:23 +0000289 w = len(quote) - 1
290 i = i+w
291 while i < n:
292 ch = str[i]
293 i = i+1
294
295 if ch == 'x':
296 continue
297
298 if str[i-1:i+w] == quote:
299 i = i+w
300 break
301
302 if ch == '\n':
303 lno = lno + 1
304 if w == 0:
305 # unterminated single-quoted string
306 if level == 0:
307 push_good(lno)
308 break
309 continue
310
311 if ch == '\\':
312 assert i < n
313 if str[i] == '\n':
314 lno = lno + 1
315 i = i+1
316 continue
317
318 # else comment char or paren inside string
319
320 else:
321 # didn't break out of the loop, so we're still
322 # inside a string
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +0000323 if (lno - 1) == firstlno:
324 # before the previous \n in str, we were in the first
325 # line of the string
326 continuation = C_STRING_FIRST_LINE
327 else:
328 continuation = C_STRING_NEXT_LINES
David Scherer7aced172000-08-15 01:13:23 +0000329 continue # with outer loop
330
331 if ch == '#':
332 # consume the comment
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000333 i = str.find('\n', i)
David Scherer7aced172000-08-15 01:13:23 +0000334 assert i >= 0
335 continue
336
337 assert ch == '\\'
338 assert i < n
339 if str[i] == '\n':
340 lno = lno + 1
341 if i+1 == n:
342 continuation = C_BACKSLASH
343 i = i+1
344
345 # The last stmt may be continued for all 3 reasons.
346 # String continuation takes precedence over bracket
347 # continuation, which beats backslash continuation.
Kurt B. Kaiserb61602c2005-11-15 07:20:06 +0000348 if (continuation != C_STRING_FIRST_LINE
349 and continuation != C_STRING_NEXT_LINES and level > 0):
David Scherer7aced172000-08-15 01:13:23 +0000350 continuation = C_BRACKET
351 self.continuation = continuation
352
353 # Push the final line number as a sentinel value, regardless of
354 # whether it's continued.
355 assert (continuation == C_NONE) == (goodlines[-1] == lno)
356 if goodlines[-1] != lno:
357 push_good(lno)
358
359 def get_continuation_type(self):
360 self._study1()
361 return self.continuation
362
363 # study1 was sufficient to determine the continuation status,
364 # but doing more requires looking at every character. study2
365 # does this for the last interesting statement in the block.
366 # Creates:
367 # self.stmt_start, stmt_end
368 # slice indices of last interesting stmt
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000369 # self.stmt_bracketing
370 # the bracketing structure of the last interesting stmt;
371 # for example, for the statement "say(boo) or die", stmt_bracketing
372 # will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are
373 # treated as brackets, for the matter.
David Scherer7aced172000-08-15 01:13:23 +0000374 # self.lastch
375 # last non-whitespace character before optional trailing
376 # comment
377 # self.lastopenbracketpos
378 # if continuation is C_BRACKET, index of last open bracket
379
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000380 def _study2(self):
David Scherer7aced172000-08-15 01:13:23 +0000381 if self.study_level >= 2:
382 return
383 self._study1()
384 self.study_level = 2
385
386 # Set p and q to slice indices of last interesting stmt.
387 str, goodlines = self.str, self.goodlines
388 i = len(goodlines) - 1
389 p = len(str) # index of newest line
390 while i:
391 assert p
392 # p is the index of the stmt at line number goodlines[i].
393 # Move p back to the stmt at line number goodlines[i-1].
394 q = p
395 for nothing in range(goodlines[i-1], goodlines[i]):
396 # tricky: sets p to 0 if no preceding newline
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000397 p = str.rfind('\n', 0, p-1) + 1
David Scherer7aced172000-08-15 01:13:23 +0000398 # The stmt str[p:q] isn't a continuation, but may be blank
399 # or a non-indenting comment line.
400 if _junkre(str, p):
401 i = i-1
402 else:
403 break
404 if i == 0:
405 # nothing but junk!
406 assert p == 0
407 q = p
408 self.stmt_start, self.stmt_end = p, q
409
410 # Analyze this stmt, to find the last open bracket (if any)
411 # and last interesting character (if any).
412 lastch = ""
413 stack = [] # stack of open bracket indices
414 push_stack = stack.append
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000415 bracketing = [(p, 0)]
David Scherer7aced172000-08-15 01:13:23 +0000416 while p < q:
417 # suck up all except ()[]{}'"#\\
418 m = _chew_ordinaryre(str, p, q)
419 if m:
420 # we skipped at least one boring char
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000421 newp = m.end()
David Scherer7aced172000-08-15 01:13:23 +0000422 # back up over totally boring whitespace
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000423 i = newp - 1 # index of last boring char
424 while i >= p and str[i] in " \t\n":
David Scherer7aced172000-08-15 01:13:23 +0000425 i = i-1
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000426 if i >= p:
David Scherer7aced172000-08-15 01:13:23 +0000427 lastch = str[i]
Kurt B. Kaiser3269cc82001-07-13 20:33:46 +0000428 p = newp
David Scherer7aced172000-08-15 01:13:23 +0000429 if p >= q:
430 break
431
432 ch = str[p]
433
434 if ch in "([{":
435 push_stack(p)
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000436 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000437 lastch = ch
438 p = p+1
439 continue
440
441 if ch in ")]}":
442 if stack:
443 del stack[-1]
444 lastch = ch
445 p = p+1
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000446 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000447 continue
448
449 if ch == '"' or ch == "'":
450 # consume string
451 # Note that study1 did this with a Python loop, but
452 # we use a regexp here; the reason is speed in both
453 # cases; the string may be huge, but study1 pre-squashed
454 # strings to a couple of characters per line. study1
455 # also needed to keep track of newlines, and we don't
456 # have to.
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000457 bracketing.append((p, len(stack)+1))
David Scherer7aced172000-08-15 01:13:23 +0000458 lastch = ch
459 p = _match_stringre(str, p, q).end()
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000460 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000461 continue
462
463 if ch == '#':
464 # consume comment and trailing newline
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000465 bracketing.append((p, len(stack)+1))
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000466 p = str.find('\n', p, q) + 1
David Scherer7aced172000-08-15 01:13:23 +0000467 assert p > 0
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000468 bracketing.append((p, len(stack)))
David Scherer7aced172000-08-15 01:13:23 +0000469 continue
470
471 assert ch == '\\'
472 p = p+1 # beyond backslash
473 assert p < q
474 if str[p] != '\n':
475 # the program is invalid, but can't complain
476 lastch = ch + str[p]
477 p = p+1 # beyond escaped char
478
479 # end while p < q:
480
481 self.lastch = lastch
482 if stack:
483 self.lastopenbracketpos = stack[-1]
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000484 self.stmt_bracketing = tuple(bracketing)
David Scherer7aced172000-08-15 01:13:23 +0000485
486 # Assuming continuation is C_BRACKET, return the number
487 # of spaces the next line should be indented.
488
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000489 def compute_bracket_indent(self):
David Scherer7aced172000-08-15 01:13:23 +0000490 self._study2()
491 assert self.continuation == C_BRACKET
492 j = self.lastopenbracketpos
493 str = self.str
494 n = len(str)
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000495 origi = i = str.rfind('\n', 0, j) + 1
David Scherer7aced172000-08-15 01:13:23 +0000496 j = j+1 # one beyond open bracket
497 # find first list item; set i to start of its line
498 while j < n:
499 m = _itemre(str, j)
500 if m:
501 j = m.end() - 1 # index of first interesting char
502 extra = 0
503 break
504 else:
505 # this line is junk; advance to next line
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000506 i = j = str.find('\n', j) + 1
David Scherer7aced172000-08-15 01:13:23 +0000507 else:
508 # nothing interesting follows the bracket;
509 # reproduce the bracket line's indentation + a level
510 j = i = origi
511 while str[j] in " \t":
512 j = j+1
513 extra = self.indentwidth
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000514 return len(str[i:j].expandtabs(self.tabwidth)) + extra
David Scherer7aced172000-08-15 01:13:23 +0000515
516 # Return number of physical lines in last stmt (whether or not
517 # it's an interesting stmt! this is intended to be called when
518 # continuation is C_BACKSLASH).
519
520 def get_num_lines_in_stmt(self):
521 self._study1()
522 goodlines = self.goodlines
523 return goodlines[-1] - goodlines[-2]
524
525 # Assuming continuation is C_BACKSLASH, return the number of spaces
526 # the next line should be indented. Also assuming the new line is
527 # the first one following the initial line of the stmt.
528
529 def compute_backslash_indent(self):
530 self._study2()
531 assert self.continuation == C_BACKSLASH
532 str = self.str
533 i = self.stmt_start
534 while str[i] in " \t":
535 i = i+1
536 startpos = i
537
538 # See whether the initial line starts an assignment stmt; i.e.,
539 # look for an = operator
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000540 endpos = str.find('\n', startpos) + 1
David Scherer7aced172000-08-15 01:13:23 +0000541 found = level = 0
542 while i < endpos:
543 ch = str[i]
544 if ch in "([{":
545 level = level + 1
546 i = i+1
547 elif ch in ")]}":
548 if level:
549 level = level - 1
550 i = i+1
551 elif ch == '"' or ch == "'":
552 i = _match_stringre(str, i, endpos).end()
553 elif ch == '#':
554 break
555 elif level == 0 and ch == '=' and \
556 (i == 0 or str[i-1] not in "=<>!") and \
557 str[i+1] != '=':
558 found = 1
559 break
560 else:
561 i = i+1
562
563 if found:
564 # found a legit =, but it may be the last interesting
565 # thing on the line
566 i = i+1 # move beyond the =
567 found = re.match(r"\s*\\", str[i:endpos]) is None
568
569 if not found:
570 # oh well ... settle for moving beyond the first chunk
571 # of non-whitespace chars
572 i = startpos
573 while str[i] not in " \t\n":
574 i = i+1
575
Kurt B. Kaiser254eb532002-09-17 03:55:13 +0000576 return len(str[self.stmt_start:i].expandtabs(\
David Scherer7aced172000-08-15 01:13:23 +0000577 self.tabwidth)) + 1
578
579 # Return the leading whitespace on the initial line of the last
580 # interesting stmt.
581
582 def get_base_indent_string(self):
583 self._study2()
584 i, n = self.stmt_start, self.stmt_end
585 j = i
586 str = self.str
587 while j < n and str[j] in " \t":
588 j = j + 1
589 return str[i:j]
590
591 # Did the last interesting stmt open a block?
592
593 def is_block_opener(self):
594 self._study2()
595 return self.lastch == ':'
596
597 # Did the last interesting stmt close a block?
598
599 def is_block_closer(self):
600 self._study2()
601 return _closere(self.str, self.stmt_start) is not None
602
603 # index of last open bracket ({[, or None if none
604 lastopenbracketpos = None
605
606 def get_last_open_bracket_pos(self):
607 self._study2()
608 return self.lastopenbracketpos
Kurt B. Kaiserb1754452005-11-18 22:05:48 +0000609
610 # the structure of the bracketing of the last interesting statement,
611 # in the format defined in _study2, or None if the text didn't contain
612 # anything
613 stmt_bracketing = None
614
615 def get_last_stmt_bracketing(self):
616 self._study2()
617 return self.stmt_bracketing