#-----------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Copyright (C) 2008-2010, Eli Bendersky
# License: LGPL
#-----------------------------------------------------------------

import re
import sys

import ply.lex
from ply.lex import TOKEN


class CLexer(object):
18 """ A lexer for the C language. After building it, set the
19 input text with input(), and call token() to get new
20 tokens.
21
22 The public attribute filename can be set to an initial
23 filaneme, but the lexer will update it upon #line
24 directives.
25 """
    def __init__(self, error_func, type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
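
        # After t_PPHASH consumes the '#', this pattern recognizes the
        # rest of either directive form, e.g.:
        #
        #   #line 66 "file.h"       (the standard #line directive)
        #   # 66 "file.h"           (GCC's cpp output)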

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = ply.lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        self.lexer.input(text)

    def token(self):
        return self.lexer.token()

    ######################-- PRIVATE --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _find_tok_column(self, token):
        # Scan backwards from the token's position to the nearest
        # preceding newline (or the start of the input), and derive
        # the column from the offset.
        i = token.lexpos
        while i > 0:
            if self.lexer.lexdata[i] == '\n':
                break
            i -= 1
        return (token.lexpos - i) + 1

    def _make_tok_location(self, token):
        return (token.lineno, self._find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE',
        'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INT', 'LONG', 'REGISTER',
        'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map each keyword to its token type, e.g. 'while' -> 'WHILE'
    keyword_map = {}
    for r in keywords:
        keyword_map[r.lower()] = r

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # Constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3)
    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU][lL])|([lL][uU])|[uU]|[lL])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = '0[xX][0-9a-fA-F]+'+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'
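
    # A rough sketch of what these patterns accept (suffixes as in
    # integer_suffix_opt):
    #
    #   decimal: 42, 42u, 42UL        octal: 052, 052L
    #   hex: 0x2f, 0X2FuL             bad octal (lexer error): 078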

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z are allowed as escape chars to support #line
    # directives with Windows paths as filenames (\dir\file...)
    #
    simple_escape = r"""([a-zA-Z\\?'"])"""
    octal_escape = r"""([0-7]{1,3})"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+octal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
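
    # For example, char_const accepts 'a', '\n', '\061' and '\x41',
    # while bad_char_const is meant to catch forms like 'ab', '' and '\9'.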

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
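
    # e.g. floating_constant accepts 1.5, .5, 3., 1e10 and 1.5e-3f;
    # a lone 15 has neither a point nor an exponent, so it lexes as an
    # integer constant instead.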

    ##
    ## Lexer states
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),
    )
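
    # Rough flow of #line handling: t_PPHASH sees the '#' and switches
    # the lexer into 'ppline'; t_ppline_LINE_NUMBER and t_ppline_FILENAME
    # then collect the number and (optional) filename, and
    # t_ppline_NEWLINE commits them to lexer.lineno and self.filename.
    # For example, after:
    #
    #   #line 66 "kwas\df.h"
    #
    # subsequent tokens report line numbers from 66 on, with filename
    # "kwas\df.h".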

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        m = self.line_pattern.match(
            t.lexer.lexdata, pos=t.lexer.lexpos)

        if m:
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
            #~ print "ppline starts on line %s" % t.lexer.lineno
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')
            #~ print "PP got filename: ", self.pp_filename

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'

        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        msg = 'invalid #line directive'
        self._error(msg, t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_LBRACE            = r'\{'
    t_RBRACE            = r'\}'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    t_STRING_LITERAL    = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # which would mis-lex hex and octal constants)
    #
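    # (for instance, if decimal_constant were tried first, "0x17" could
    # lex as the decimal constant "0" followed by the identifier "x17")
    #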
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")

        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"

        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)


if __name__ == "__main__":
    from portability import printme
    #~ filename = '../zp.c'
    #~ text = open(filename).read()

    #~ text = '"'+r"""ka \p ka"""+'"'
    text = r"""
    546
    #line 66 "kwas\df.h"
    id 4
    # 5
    dsf
    """

    def errfoo(msg, a, b):
        printme(msg)
        sys.exit()

    def typelookup(name):
        return False

    clex = CLexer(errfoo, typelookup)
    clex.build()
    clex.input(text)

    while 1:
        tok = clex.token()
        if not tok:
            break

        #~ print type(tok)
        printme([tok.value, tok.type, tok.lineno, clex.filename, tok.lexpos])