Blame - Lib/idlelib/PyParse.py - platform/external/python/cpython2

blob: 1bf4919c21ac9803947d2f51b3477c398ff994d1 [file] [log] [blame]

David Scherer	7aced17	2000-08-15 01:13:23 +0000	[diff] [blame]	1	import re
				2	import sys
				3
				4	# Reason last stmt is continued (or C_NONE if it's not).
# Continuation kinds reported by Parser.get_continuation_type().
C_NONE = 0                  # last stmt is not continued
C_BACKSLASH = 1             # continued by a trailing backslash
C_STRING_FIRST_LINE = 2     # inside an unclosed string, still on its first line
C_STRING_NEXT_LINES = 3     # inside an unclosed string, past its first line
C_BRACKET = 4               # inside an unclosed bracket structure
David Scherer	7aced17	2000-08-15 01:13:23 +0000	[diff] [blame]	7
				8	if 0: # for throwaway debugging output
				9	def dump(*stuff):
Kurt B. Kaiser	254eb53	2002-09-17 03:55:13 +0000	[diff] [blame]	10	sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
David Scherer	7aced17	2000-08-15 01:13:23 +0000	[diff] [blame]	11
				12	# Find what looks like the start of a popular stmt.
				13
				14	_synchre = re.compile(r"""
				15	^
				16	[ \t]*
				17	(?: if
				18	\| for
				19	\| while
				20	\| else
				21	\| def
				22	\| return
				23	\| assert
				24	\| break
				25	\| class
				26	\| continue
				27	\| elif
				28	\| try
				29	\| except
				30	\| raise
				31	\| import
Kurt B. Kaiser	752e4d5	2001-07-14 04:59:24 +0000	[diff] [blame]	32	\| yield
David Scherer	7aced17	2000-08-15 01:13:23 +0000	[diff] [blame]	33	)
				34	\b
				35	""", re.VERBOSE \| re.MULTILINE).search
				36
				37	# Match blank line or non-indenting comment line.
				38
# _junkre(s, pos) is the bound .match of a verbose pattern: optional
# leading blanks/tabs, then optionally a "#" that is immediately followed
# by a non-whitespace character plus the rest of the line, then a newline.
# A comment whose "#" is followed by whitespace does not match.
_junkre = re.compile(r"""
[ \t]*
(?: \# \S .* )?
\n
""", re.VERBOSE).match
				44
				45	# Match any flavor of string; the terminating quote is optional
				46	# so that we're robust in the face of incomplete program text.
				47
				48	_match_stringre = re.compile(r"""
				49	\""" [^"\\]* (?:
				50	(?: \\. \| "(?!"") )
				51	[^"\\]*
				52	)*
				53	(?: \""" )?
				54
				55	\| " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
				56
				57	\| ''' [^'\\]* (?:
				58	(?: \\. \| '(?!'') )
				59	[^'\\]*
				60	)*
				61	(?: ''' )?
				62
				63	\| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
				64	""", re.VERBOSE \| re.DOTALL).match
				65
				66	# Match a line that starts with something interesting;
				67	# used to find the first item of a bracket structure.
				68
# _itemre(s, pos) is the bound .match of a verbose pattern: optional
# blanks/tabs followed by exactly one character that is not whitespace,
# not "#" and not a backslash.  It fails on blank lines, comment-only
# lines and lines that start with a backslash.
_itemre = re.compile(r"""
[ \t]*
[^\s#\\] # if we match, m.end()-1 is the interesting char
""", re.VERBOSE).match
				73
				74	# Match start of stmts that should be followed by a dedent.
				75
				76	_closere = re.compile(r"""
				77	\s*
				78	(?: return
				79	\| break
				80	\| continue
				81	\| raise
				82	\| pass
				83	)
				84	\b
				85	""", re.VERBOSE).match
				86
				87	# Chew up non-special chars as quickly as possible. If match is
				88	# successful, m.end() less 1 is the index of the last boring char
				89	# matched. If match is unsuccessful, the string starts with an
				90	# interesting char.
				91
# _chew_ordinaryre(s, pos[, endpos]) is the bound .match of a pattern that
# consumes one or more characters, none of which is a bracket
# ("[", "]", "(", ")", "{", "}"), "#", a single or double quote, or a
# backslash.
_chew_ordinaryre = re.compile(r"""
[^[\](){}#'"\\]+
""", re.VERBOSE).match
				95
				96	# Build translation table to map uninteresting chars to "x", open
				97	# brackets to "(", and close brackets to ")".
				98
				99	_tran = ['x'] * 256
				100	for ch in "({[":
				101	_tran[ord(ch)] = '('
				102	for ch in ")}]":
				103	_tran[ord(ch)] = ')'
				104	for ch in "\"'\\\n#":
				105	_tran[ord(ch)] = ch
Kurt B. Kaiser	254eb53	2002-09-17 03:55:13 +0000	[diff] [blame]	106	_tran = ''.join(_tran)
David Scherer	7aced17	2000-08-15 01:13:23 +0000	[diff] [blame]	107	del ch
				108
# UnicodeType is the type of unicode strings where the "unicode" builtin
# exists, and None where looking it up raises NameError.
try:
    _empty_unicode = unicode("")
except NameError:
    UnicodeType = None
else:
    UnicodeType = type(_empty_unicode)
    del _empty_unicode
				113
class Parser:
    """Scanner that analyses the tail of a chunk of Python source text.

    Typical use: call set_str() with text ending in a newline, optionally
    narrow it with find_good_parse_start() + set_lo(), then query the
    continuation type, indentation hints and block opener/closer status.

    Analysis is lazy and done in two stages tracked by self.study_level:
    _study1 creates self.goodlines and self.continuation; _study2 creates
    self.stmt_start, self.stmt_end, self.lastch and
    self.lastopenbracketpos.
    """

    def __init__(self, indentwidth, tabwidth):
        """Remember the indent width and tab width used for indent sizes."""
        self.indentwidth = indentwidth
        self.tabwidth = tabwidth

    def set_str(self, str):
        """Set the text to analyse and discard any earlier study results.

        str must be empty or end with a newline (checked with an assert).
        """
        assert len(str) == 0 or str[-1] == '\n'
        # isinstance (rather than an exact type check) so that subclasses
        # of the unicode type are converted too; the None guard covers
        # interpreters where UnicodeType could not be determined.
        if UnicodeType is not None and isinstance(str, UnicodeType):
            # The parse functions have no idea what to do with Unicode, so
            # replace all Unicode characters with "x".  This is "safe"
            # so long as the only characters germane to parsing the
            # structure of Python are 7-bit ASCII.  It's necessary because
            # Unicode strings don't have a .translate() method that
            # supports deletechars.
            chars = []
            push = chars.append
            for raw in map(ord, str):
                if raw < 127:
                    push(chr(raw))
                else:
                    push("x")
            str = "".join(chars)
        self.str = str
        self.study_level = 0

    def find_good_parse_start(self, use_ps1, is_char_in_string=None,
                              _synchre=_synchre):
        """Return the index of a good place to begin parsing, or None.

        The index is as close to the end of the string as possible.  In
        file-window mode it is the start of some popular stmt like "if"
        or "def".  None means none was found: the caller should pass more
        prior context then, if possible, or if not (the entire program
        text up until the point of interest has already been tried) pass
        0 to set_lo.

        use_ps1 -- true for a shell window: the position just after the
            last "\\n" + sys.ps1 is returned (None if there is none), and
            self.str is patched so a newline replaces the last character
            of that prompt.
        is_char_in_string -- callable taking an index into the text.  The
            result is reliable iff this function is reliable, meaning that
            when it says "no", it's absolutely guaranteed that the char is
            not in a string.  If it is false/None (file-window mode), None
            is returned.

        Ack, hack: in the shell window this kills us, because there's
        no way to tell the differences between output, >>> etc and
        user input.  Indeed, IDLE's first output line makes the rest
        look like it's in an unclosed paren!:
        Python 1.5.2 (#0, Apr 13 1999, ...
        """
        code, pos = self.str, None
        if use_ps1:
            # shell window
            ps1 = '\n' + sys.ps1
            i = code.rfind(ps1)
            if i >= 0:
                pos = i + len(ps1)
                # make it look like there's a newline instead
                # of ps1 at the start -- hacking here once avoids
                # repeated hackery later
                self.str = code[:pos-1] + '\n' + code[pos:]
            return pos

        # File window -- real work.
        if not is_char_in_string:
            # no clue -- make the caller pass everything
            return None

        # Peek back from the end for a good place to start,
        # but don't try too often; pos will be left None, or
        # bumped to a legitimate synch point.
        limit = len(code)
        for _ in range(5):
            i = code.rfind(":\n", 0, limit)
            if i < 0:
                break
            i = code.rfind('\n', 0, i) + 1  # start of colon line
            m = _synchre(code, i, limit)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
                break
            limit = i
        if pos is None:
            # Nothing looks like a block-opener, or stuff does
            # but is_char_in_string keeps returning true; most likely
            # we're in or near a giant string, the colorizer hasn't
            # caught up enough to be helpful, or there simply aren't
            # any interesting stmts.  In any of these cases we're
            # going to have to parse the whole thing to be sure, so
            # give it one last try from the start, but stop wasting
            # time here regardless of the outcome.
            m = _synchre(code)
            if m and not is_char_in_string(m.start()):
                pos = m.start()
            return pos

        # Peeking back worked; look forward until _synchre no longer
        # matches.
        i = pos + 1
        while True:
            m = _synchre(code, i)
            if not m:
                break
            s, i = m.span()
            if not is_char_in_string(s):
                pos = s
        return pos

    def set_lo(self, lo):
        """Throw away the start of the string (everything before lo).

        Intended to be called with find_good_parse_start's result.  lo
        must be 0 or the index just after a newline (checked with an
        assert).
        """
        assert lo == 0 or self.str[lo-1] == '\n'
        if lo > 0:
            self.str = self.str[lo:]

    def _study1(self):
        """Find the line numbers (0-based) of the non-continuation lines.

        As quickly as humanly possible <wink>.
        Creates self.{goodlines, continuation}.  Does nothing if the
        text has already been studied at this level.
        """
        if self.study_level >= 1:
            return
        self.study_level = 1

        # Map all uninteresting characters to "x", all open brackets
        # to "(", all close brackets to ")", then collapse runs of
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        code = self.str.translate(_tran)
        code = code.replace('xxxxxxxx', 'x')
        code = code.replace('xxxx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('xx', 'x')
        code = code.replace('\nx', '\n')
        # note that replacing x\n with \n would be incorrect, because
        # x may be preceded by a backslash

        # March over the squashed version of the program, accumulating
        # the line numbers of non-continued stmts, and determining
        # whether & why the last stmt is a continuation.
        continuation = C_NONE
        level = lno = 0     # level is nesting level; lno is line number
        self.goodlines = goodlines = [0]
        push_good = goodlines.append
        i, n = 0, len(code)
        while i < n:
            ch = code[i]
            i += 1

            # cases are checked in decreasing order of frequency
            if ch == 'x':
                continue

            if ch == '\n':
                lno += 1
                if level == 0:
                    push_good(lno)
                # else we're in an unclosed bracket structure
                continue

            if ch == '(':
                level += 1
                continue

            if ch == ')':
                if level:
                    level -= 1
                # else the program is invalid, but we can't complain
                continue

            if ch == '"' or ch == "'":
                # consume the string
                quote = ch
                if code[i-1:i+2] == quote * 3:
                    quote = quote * 3
                firstlno = lno
                w = len(quote) - 1
                i += w
                while i < n:
                    ch = code[i]
                    i += 1

                    if ch == 'x':
                        continue

                    if code[i-1:i+w] == quote:
                        i += w
                        break

                    if ch == '\n':
                        lno += 1
                        if w == 0:
                            # unterminated single-quoted string
                            if level == 0:
                                push_good(lno)
                            break
                        continue

                    if ch == '\\':
                        assert i < n
                        if code[i] == '\n':
                            lno += 1
                        i += 1
                        continue

                    # else comment char or paren inside string

                else:
                    # didn't break out of the loop, so we're still
                    # inside a string
                    if (lno - 1) == firstlno:
                        # before the previous \n in the text, we were in
                        # the first line of the string
                        continuation = C_STRING_FIRST_LINE
                    else:
                        continuation = C_STRING_NEXT_LINES
                continue    # with outer loop

            if ch == '#':
                # consume the comment
                i = code.find('\n', i)
                assert i >= 0
                continue

            assert ch == '\\'
            assert i < n
            if code[i] == '\n':
                lno += 1
                if i+1 == n:
                    continuation = C_BACKSLASH
            i += 1

        # The last stmt may be continued for all 3 reasons.
        # String continuation takes precedence over bracket
        # continuation, which beats backslash continuation.
        if (continuation != C_STRING_FIRST_LINE
                and continuation != C_STRING_NEXT_LINES and level > 0):
            continuation = C_BRACKET
        self.continuation = continuation

        # Push the final line number as a sentinel value, regardless of
        # whether it's continued.
        assert (continuation == C_NONE) == (goodlines[-1] == lno)
        if goodlines[-1] != lno:
            push_good(lno)

    def get_continuation_type(self):
        """Return the C_* constant saying why the last stmt is continued
        (C_NONE if it is not)."""
        self._study1()
        return self.continuation

    def _study2(self):
        """Analyse the last interesting statement in the block.

        study1 was sufficient to determine the continuation status,
        but doing more requires looking at every character.  study2
        does this for the last interesting statement in the block.
        Creates:
            self.stmt_start, stmt_end
                slice indices of last interesting stmt
            self.lastch
                last non-whitespace character before optional trailing
                comment
            self.lastopenbracketpos
                index of the last open bracket in that stmt, or None if
                no bracket is left open
        Does nothing if the text has already been studied at this level.
        """
        if self.study_level >= 2:
            return
        self._study1()
        self.study_level = 2

        # Set p and q to slice indices of last interesting stmt.
        code, goodlines = self.str, self.goodlines
        i = len(goodlines) - 1
        p = len(code)   # index of newest line
        while i:
            assert p
            # p is the index of the stmt at line number goodlines[i].
            # Move p back to the stmt at line number goodlines[i-1].
            q = p
            for _ in range(goodlines[i-1], goodlines[i]):
                # tricky: sets p to 0 if no preceding newline
                p = code.rfind('\n', 0, p-1) + 1
            # The stmt code[p:q] isn't a continuation, but may be blank
            # or a non-indenting comment line.
            if _junkre(code, p):
                i -= 1
            else:
                break
        if i == 0:
            # nothing but junk!
            assert p == 0
            q = p
        self.stmt_start, self.stmt_end = p, q

        # Analyze this stmt, to find the last open bracket (if any)
        # and last interesting character (if any).
        lastch = ""
        stack = []  # stack of open bracket indices
        push_stack = stack.append
        while p < q:
            # suck up all except ()[]{}'"#\\
            m = _chew_ordinaryre(code, p, q)
            if m:
                # we skipped at least one boring char
                newp = m.end()
                # back up over totally boring whitespace
                i = newp - 1    # index of last boring char
                while i >= p and code[i] in " \t\n":
                    i -= 1
                if i >= p:
                    lastch = code[i]
                p = newp
                if p >= q:
                    break

            ch = code[p]

            if ch in "([{":
                push_stack(p)
                lastch = ch
                p += 1
                continue

            if ch in ")]}":
                if stack:
                    del stack[-1]
                lastch = ch
                p += 1
                continue

            if ch == '"' or ch == "'":
                # consume string
                # Note that study1 did this with a Python loop, but
                # we use a regexp here; the reason is speed in both
                # cases; the string may be huge, but study1 pre-squashed
                # strings to a couple of characters per line.  study1
                # also needed to keep track of newlines, and we don't
                # have to.
                lastch = ch
                p = _match_stringre(code, p, q).end()
                continue

            if ch == '#':
                # consume comment and trailing newline
                p = code.find('\n', p, q) + 1
                assert p > 0
                continue

            assert ch == '\\'
            p += 1  # beyond backslash
            assert p < q
            if code[p] != '\n':
                # the program is invalid, but can't complain
                lastch = ch + code[p]
            p += 1  # beyond escaped char

        # end while p < q:

        self.lastch = lastch
        # Always (re)assign: set_str() resets study_level, so this method
        # can run again on new text, and a value left over from earlier
        # text must not survive when no bracket is open now.
        if stack:
            self.lastopenbracketpos = stack[-1]
        else:
            self.lastopenbracketpos = None

    def compute_bracket_indent(self):
        """Return the number of spaces the next line should be indented.

        Assumes continuation is C_BRACKET (checked with an assert).
        """
        self._study2()
        assert self.continuation == C_BRACKET
        j = self.lastopenbracketpos
        code = self.str
        n = len(code)
        origi = i = code.rfind('\n', 0, j) + 1
        j += 1  # one beyond open bracket
        # find first list item; set i to start of its line
        while j < n:
            m = _itemre(code, j)
            if m:
                j = m.end() - 1     # index of first interesting char
                extra = 0
                break
            # this line is junk; advance to next line
            i = j = code.find('\n', j) + 1
        else:
            # nothing interesting follows the bracket;
            # reproduce the bracket line's indentation + a level
            j = i = origi
            while code[j] in " \t":
                j += 1
            extra = self.indentwidth
        return len(code[i:j].expandtabs(self.tabwidth)) + extra

    def get_num_lines_in_stmt(self):
        """Return number of physical lines in last stmt.

        Whether or not it's an interesting stmt!  This is intended to be
        called when continuation is C_BACKSLASH.
        """
        self._study1()
        goodlines = self.goodlines
        return goodlines[-1] - goodlines[-2]

    def compute_backslash_indent(self):
        """Return the number of spaces the next line should be indented.

        Assumes continuation is C_BACKSLASH (checked with an assert).
        Also assumes the new line is the first one following the initial
        line of the stmt.
        """
        self._study2()
        assert self.continuation == C_BACKSLASH
        code = self.str
        i = self.stmt_start
        while code[i] in " \t":
            i += 1
        startpos = i

        # See whether the initial line starts an assignment stmt; i.e.,
        # look for an = operator
        endpos = code.find('\n', startpos) + 1
        found = level = 0
        while i < endpos:
            ch = code[i]
            if ch in "([{":
                level += 1
                i += 1
            elif ch in ")]}":
                if level:
                    level -= 1
                i += 1
            elif ch == '"' or ch == "'":
                i = _match_stringre(code, i, endpos).end()
            elif ch == '#':
                break
            elif (level == 0 and ch == '='
                    and (i == 0 or code[i-1] not in "=<>!")
                    and code[i+1] != '='):
                found = 1
                break
            else:
                i += 1

        if found:
            # found a legit =, but it may be the last interesting
            # thing on the line
            i += 1  # move beyond the =
            found = re.match(r"\s*\\", code[i:endpos]) is None

        if not found:
            # oh well ... settle for moving beyond the first chunk
            # of non-whitespace chars
            i = startpos
            while code[i] not in " \t\n":
                i += 1

        return len(code[self.stmt_start:i].expandtabs(self.tabwidth)) + 1

    def get_base_indent_string(self):
        """Return the leading whitespace on the initial line of the last
        interesting stmt."""
        self._study2()
        i, n = self.stmt_start, self.stmt_end
        j = i
        code = self.str
        while j < n and code[j] in " \t":
            j += 1
        return code[i:j]

    def is_block_opener(self):
        """Did the last interesting stmt open a block?"""
        self._study2()
        return self.lastch == ':'

    def is_block_closer(self):
        """Did the last interesting stmt close a block?"""
        self._study2()
        return _closere(self.str, self.stmt_start) is not None

    # index of last open bracket ({[, or None if none
    lastopenbracketpos = None

    def get_last_open_bracket_pos(self):
        """Return the index of the last open bracket in the last
        interesting stmt, or None if there is none."""
        self._study2()
        return self.lastopenbracketpos