blob: 0f3e7e0e29511831ba919b3be818be3de062bf4c [file] [log] [blame]
Eli Bendersky3921e8e2010-05-21 09:05:39 +03001#-----------------------------------------------------------------
2# pycparser: c_lexer.py
3#
4# CLexer class: lexer for the C language
5#
eli.bendersky1a1e46b2011-02-18 15:32:18 +02006# Copyright (C) 2008-2011, Eli Bendersky
eli.bendersky84a6a632011-04-29 09:00:43 +03007# License: BSD
Eli Bendersky3921e8e2010-05-21 09:05:39 +03008#-----------------------------------------------------------------
9
10import re
11import sys
12
13import ply.lex
14from ply.lex import TOKEN
15
16
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        # Raw string so \W and \d reach the re module explicitly
        # (the previous non-raw literal relied on Python passing
        # unknown escapes through unchanged).
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = ply.lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Set the input text to tokenize. """
        self.lexer.input(text)

    def token(self):
        """ Return the next token, or None when input is exhausted. """
        g = self.lexer.token()
        return g

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report the error through the user-supplied callback with a
        # (line, column) location, then skip one character so lexing
        # can continue.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _find_tok_column(self, token):
        """ Find the 1-based column of the token in its line. """
        # BUGFIX: the previous backward linear scan returned
        # (lexpos - i) + 1 where i was the index of the newline
        # itself, reporting column 2 for the first character of
        # every line after the first. rfind returns the index of
        # the last '\n' before the token, or -1 on the first line,
        # so the subtraction yields a correct 1-based column in
        # both cases.
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    def _make_tok_location(self, token):
        # (line, column) pair used in error reports.
        return (token.lineno, self._find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        '_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG', 'REGISTER',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map each keyword's C spelling to its token name. The C99
    # underscore keywords have mixed-case spellings (_Bool,
    # _Complex), so they are special-cased; all others are just the
    # lowercased token name.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        elif keyword == '_COMPLEX':
            keyword_map['_Complex'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',             # ( )
        'LBRACKET', 'RBRACKET',         # [ ]
        'LBRACE', 'RBRACE',             # { }
        'COMMA', 'PERIOD',              # , .
        'SEMI', 'COLON',                # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3)
    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(u?ll|U?LL|([uU][lL])|([lL][uU])|[uU]|[lL])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    #
    simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    octal_escape = r"""([0-7]{1,3})"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+octal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),
    )

    # NOTE: the docstring of each t_* method below is its PLY regex,
    # and PLY tries function-defined rules in source order -- neither
    # the docstrings nor the definition order may be changed casually.

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        # A '#' either opens a #line directive (switch to the
        # 'ppline' state and emit nothing) or is passed through as a
        # PPHASH token.
        m = self.line_pattern.match(
            t.lexer.lexdata, pos=t.lexer.lexpos)

        if m:
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        # The filename must follow the line number in a #line
        # directive.
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        # End of the #line directive: commit the collected line
        # number and (optional) filename, then resume normal lexing.
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        # The literal 'line' keyword of "#line" carries no payload.
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        msg = 'invalid #line directive'
        self._error(msg, t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_LBRACE            = r'\{'
    t_RBRACE            = r'\}'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # An identifier may actually be a keyword, or a typedef'd
        # type name (resolved via the user-supplied lookup).
        t.type = self.keyword_map.get(t.value, "ID")

        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"

        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
424
425
if __name__ == "__main__":
    # Standalone smoke test: lex a small snippet containing #line
    # directives and dump every token.
    #
    # BUGFIXES: removed the dead (and crash-prone) read of '../zp.c'
    # whose result was immediately overwritten; sys.write does not
    # exist (use sys.stderr.write); printme was an undefined name
    # (use print).
    text = r"""
    546
    #line 66 "kwas\df.h"
    id 4
    # 5
    dsf
    """

    def errfoo(msg, a, b):
        # Report a lexing error (with line/column a, b) and abort.
        sys.stderr.write(msg + "\n")
        sys.exit()

    def typelookup(namd):
        # No typedef'd names exist in this standalone test.
        return False

    clex = CLexer(errfoo, typelookup)
    clex.build()
    clex.input(text)

    while 1:
        tok = clex.token()
        if not tok:
            break
        print([tok.value, tok.type, tok.lineno, clex.filename, tok.lexpos])
Eli Bendersky3921e8e2010-05-21 09:05:39 +0300456
Eli Bendersky3b1b08d2012-06-15 12:37:54 +0300457