Eli Bendersky | 3921e8e | 2010-05-21 09:05:39 +0300 | [diff] [blame] | 1 | #----------------------------------------------------------------- |
| 2 | # pycparser: c_lexer.py |
| 3 | # |
| 4 | # CLexer class: lexer for the C language |
| 5 | # |
eli.bendersky | 1a1e46b | 2011-02-18 15:32:18 +0200 | [diff] [blame] | 6 | # Copyright (C) 2008-2011, Eli Bendersky |
eli.bendersky | 84a6a63 | 2011-04-29 09:00:43 +0300 | [diff] [blame] | 7 | # License: BSD |
Eli Bendersky | 3921e8e | 2010-05-21 09:05:39 +0300 | [diff] [blame] | 8 | #----------------------------------------------------------------- |
| 9 | |
| 10 | import re |
| 11 | import sys |
| 12 | |
| 13 | import ply.lex |
| 14 | from ply.lex import TOKEN |
| 15 | |
| 16 | |
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        # NOTE: raw string - the pattern contains the regex escapes
        # \W and \d, which are invalid escapes in a plain str literal.
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = ply.lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Set the input text to be tokenized. """
        self.lexer.input(text)

    def token(self):
        """ Return the next token, or None when input is exhausted. """
        g = self.lexer.token()
        return g

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report an error through the user-supplied callback and skip
        # one character so lexing can resume.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _find_tok_column(self, token):
        # Compute the (1-based) column of the token by scanning back
        # to the most recent newline in the lexer's input buffer.
        i = token.lexpos
        while i > 0:
            if self.lexer.lexdata[i] == '\n': break
            i -= 1
        return (token.lexpos - i) + 1

    def _make_tok_location(self, token):
        # (line, column) pair for error reporting.
        return (token.lineno, self._find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', '_BOOL', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE',
        'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG', 'REGISTER',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map from source spelling to token type; '_Bool' keeps its C99
    # capitalization, all other keywords are simply lowercased.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimeters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3)
    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(u?ll|U?LL|([uU][lL])|([lL][uU])|[uU]|[lL])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = '0[xX][0-9a-fA-F]+'+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    #
    simple_escape = r"""([a-zA-Z.\\?'"])"""
    octal_escape = r"""([0-7]{1,3})"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z.\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+octal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'

    ##
    ## Lexer states
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),
    )

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        # A '#' at the start of a line either introduces a #line
        # directive (switch to the ppline state) or is passed through
        # to the parser as a PPHASH token.
        m = self.line_pattern.match(
            t.lexer.lexdata, pos=t.lexer.lexpos)

        if m:
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
            #~ print "ppline starts on line %s" % t.lexer.lineno
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        # The filename must follow the line number in a #line directive.
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')
            #~ print "PP got filename: ", self.pp_filename

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        # End of the #line directive: commit the collected line number
        # (and filename, if any) and return to normal lexing.
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        # The literal 'line' keyword in "# line ..." carries no payload.
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        msg = 'invalid #line directive'
        self._error(msg, t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimeters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_LBRACE            = r'\{'
    t_RBRACE            = r'\}'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    t_STRING_LITERAL    = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # An identifier may turn out to be a keyword or (via the
        # user-supplied lookup) a typedef'd type name.
        t.type = self.keyword_map.get(t.value, "ID")

        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"

        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
| 413 | |
| 414 | |
if __name__ == "__main__":
    # Ad-hoc smoke test: lex a small snippet exercising #line directives
    # (both "#line N file" and GCC's "# N" form) and dump each token.
    text = r"""
    546
    #line 66 "kwas\df.h"
    id 4
    # 5
    dsf
    """

    def errfoo(msg, a, b):
        # Error callback for CLexer: report the message and abort.
        # (was: sys.write(...) - sys has no 'write' attribute)
        sys.stderr.write(msg + "\n")
        sys.exit()

    def typelookup(name):
        # No typedefs exist in this standalone test, so nothing is a TYPEID.
        return False

    clex = CLexer(errfoo, typelookup)
    clex.build()
    clex.input(text)

    while True:
        tok = clex.token()
        if not tok:
            break
        # (was: printme(...) - an undefined name)
        print([tok.value, tok.type, tok.lineno, clex.filename, tok.lexpos])