"""Parser for the Python metagrammar"""

import io
import tokenize  # from stdlib

from .automata import NFA, NFAState


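# Each grammar rule is compiled into a small NFA (a Thompson-style
# construction): every rule gets a start and an end state, and the helpers
# below wire alternation, concatenation and repetition between them using
# labeled arcs and epsilon (unlabeled) arcs.
#
# A minimal usage sketch (the one-rule grammar string is made up for
# illustration):
#
#     parser = GrammarParser("start: NAME '+' NAME NEWLINE\n")
#     for nfa in parser.parse():
#         ...  # one NFA per rule, e.g. to feed the NFA-to-DFA conversion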
class GrammarParser:
    """Parser for Python grammar files."""

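    # Human-readable names for token type codes, used to build error messages.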
    _translation_table = {
        tokenize.NAME: "NAME",
        tokenize.STRING: "STRING",
        tokenize.NEWLINE: "NEWLINE",
        tokenize.NL: "NL",
        tokenize.OP: "OP",
        tokenize.ENDMARKER: "ENDMARKER",
        tokenize.COMMENT: "COMMENT",
    }

    def __init__(self, grammar):
        self.grammar = grammar
        grammar_adaptor = io.StringIO(grammar)
        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
        self._gettoken()  # Initialize lookahead
        self._current_rule_name = None

    def parse(self):
        """Turn the grammar into a collection of NFAs"""
        # grammar: (NEWLINE | rule)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self._gettoken()
            # rule: NAME ':' rhs NEWLINE
            self._current_rule_name = self._expect(tokenize.NAME)
            self._expect(tokenize.OP, ":")
            a, z = self._parse_rhs()
            self._expect(tokenize.NEWLINE)

            yield NFA(a, z)

    def _parse_rhs(self):
        # rhs: items ('|' items)*
        a, z = self._parse_items()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState(self._current_rule_name)
            zz = NFAState(self._current_rule_name)
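            # For a rule like `rhs: A | B`, the loop below wires the
            # alternatives in parallel between a shared start state and a
            # shared end state (unlabeled arcs are epsilon transitions):
            #
            #          +--> a(A) ...... z(A) --+
            #     aa --|                       |--> zz
            #          +--> a(B) ...... z(B) --+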
            while True:
                # Add an epsilon arc from the shared start state to the start
                # of this alternative, and from this alternative's end to the
                # shared end state, so each alternative can be taken
                # independently of the others.
                aa.add_arc(a)
                z.add_arc(zz)
                if self.value != "|":
                    break

                self._gettoken()
                a, z = self._parse_items()
            return aa, zz

    def _parse_items(self):
        # items: item+
        a, b = self._parse_item()
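        # Each subsequent item is chained after the previous one with an
        # epsilon arc, e.g. for `items: A B`:
        #
        #     a(A) ...... b(A) ----> c(B) ...... d(B)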
        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
            c, d = self._parse_item()
            # Add an epsilon arc from the end of the previous item to the
            # beginning of the new one, connecting all the items together.
            # This way the end can only be reached after visiting every item.
            b.add_arc(c)
            b = d
        return a, b

    def _parse_item(self):
        # item: '[' rhs ']' | atom ['+' | '*']
        if self.value == "[":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, "]")
            # Add an epsilon arc from the beginning to the end, making the
            # whole rhs optional: we can advance to the next state without
            # consuming anything from it.
            a.add_arc(z)
            return a, z
        else:
            a, z = self._parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self._gettoken()
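            # Wire the end of the atom back to its start so it can repeat
            # (an epsilon arc from z to a):
            #
            #       +------<------+
            #       v             |
            #     a(atom) ..... z(atom)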
            z.add_arc(a)
            if value == "+":
                # For '+' keep z as the end state: the cycle lets the atom
                # repeat, but it must be consumed at least once.
                return a, z
            else:
                # For '*' the end state is the beginning itself, so the atom
                # can be repeated any number of times, including zero.
                return a, a

    def _parse_atom(self):
        # atom: '(' rhs ')' | NAME | STRING
        if self.value == "(":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState(self._current_rule_name)
            z = NFAState(self._current_rule_name)
            # The arc is labeled with the token's value, so the NFA can only
            # advance to the next state by consuming that value.
            a.add_arc(z, self.value)
            self._gettoken()
            return a, z
        else:
            self._raise_error(
                "expected (...) or NAME or STRING, got {} ({})",
                self._translation_table.get(self.type, self.type),
                self.value,
            )

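    # Consume the lookahead token if it matches the expected type (and value,
    # when given) and return its value; otherwise raise a SyntaxError.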
    def _expect(self, type_, value=None):
        if self.type != type_:
            self._raise_error(
                "expected {}, got {} ({})",
                self._translation_table.get(type_, type_),
                self._translation_table.get(self.type, self.type),
                self.value,
            )
        if value is not None and self.value != value:
            self._raise_error("expected {}, got {}", value, self.value)
        value = self.value
        self._gettoken()
        return value

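    # Advance the one-token lookahead (self.type / self.value / position),
    # silently skipping comments and non-logical newlines.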
    def _gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup

    def _raise_error(self, msg, *args):
        if args:
            try:
                msg = msg.format(*args)
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        line = self.grammar.splitlines()[self.begin[0] - 1]
        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))