blob: 0f3e7e0e29511831ba919b3be818be3de062bf4c [file] [log] [blame]
Eli Bendersky3921e8e2010-05-21 09:05:39 +03001#-----------------------------------------------------------------
2# pycparser: c_lexer.py
3#
4# CLexer class: lexer for the C language
5#
eli.bendersky1a1e46b2011-02-18 15:32:18 +02006# Copyright (C) 2008-2011, Eli Bendersky
eli.bendersky84a6a632011-04-29 09:00:43 +03007# License: BSD
Eli Bendersky3921e8e2010-05-21 09:05:39 +03008#-----------------------------------------------------------------
9
10import re
11import sys
12
13import ply.lex
14from ply.lex import TOKEN
15
16
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        # Raw string so \W and \d reach the re module explicitly
        # (the previous non-raw literal relied on Python passing
        # unknown escapes through unchanged).
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = ply.lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Set the input text to tokenize. """
        self.lexer.input(text)

    def token(self):
        """ Return the next token, or None when input is exhausted. """
        g = self.lexer.token()
        return g

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report the error through the user-supplied callback with a
        # (line, column) location, then skip one character so lexing
        # can continue.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _find_tok_column(self, token):
        """ Find the 1-based column of the token in its line. """
        # BUGFIX: the previous backward linear scan returned
        # (lexpos - i) + 1 where i was the index of the newline
        # itself, reporting column 2 for the first character of
        # every line after the first. rfind returns the index of
        # the last '\n' before the token, or -1 on the first line,
        # so the subtraction yields a correct 1-based column in
        # both cases.
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    def _make_tok_location(self, token):
        # (line, column) pair used in error reports.
        return (token.lineno, self._find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        '_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG', 'REGISTER',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map each keyword's C spelling to its token name. The C99
    # underscore keywords have mixed-case spellings (_Bool,
    # _Complex), so they are special-cased; all others are just the
    # lowercased token name.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        elif keyword == '_COMPLEX':
            keyword_map['_Complex'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',             # ( )
        'LBRACKET', 'RBRACKET',         # [ ]
        'LBRACE', 'RBRACE',             # { }
        'COMMA', 'PERIOD',              # , .
        'SEMI', 'COLON',                # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3)
    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(u?ll|U?LL|([uU][lL])|([lL][uU])|[uU]|[lL])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    #
    simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    octal_escape = r"""([0-7]{1,3})"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+octal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),
    )

    # NOTE: the docstring of each t_* method below is its PLY regex,
    # and PLY tries function-defined rules in source order -- neither
    # the docstrings nor the definition order may be changed casually.

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        # A '#' either opens a #line directive (switch to the
        # 'ppline' state and emit nothing) or is passed through as a
        # PPHASH token.
        m = self.line_pattern.match(
            t.lexer.lexdata, pos=t.lexer.lexpos)

        if m:
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        # The filename must follow the line number in a #line
        # directive.
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        # End of the #line directive: commit the collected line
        # number and (optional) filename, then resume normal lexing.
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        # The literal 'line' keyword of "#line" carries no payload.
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        msg = 'invalid #line directive'
        self._error(msg, t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_LBRACE            = r'\{'
    t_RBRACE            = r'\}'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # An identifier may actually be a keyword, or a typedef'd
        # type name (resolved via the user-supplied lookup).
        t.type = self.keyword_map.get(t.value, "ID")

        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"

        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
424
425
if __name__ == "__main__":
    # Standalone smoke test: lex a small snippet containing #line
    # directives and dump every token.
    #
    # BUGFIXES: removed the dead (and crash-prone) read of '../zp.c'
    # whose result was immediately overwritten; sys.write does not
    # exist (use sys.stderr.write); printme was an undefined name
    # (use print).
    text = r"""
    546
    #line 66 "kwas\df.h"
    id 4
    # 5
    dsf
    """

    def errfoo(msg, a, b):
        # Report a lexing error (with line/column a, b) and abort.
        sys.stderr.write(msg + "\n")
        sys.exit()

    def typelookup(namd):
        # No typedef'd names exist in this standalone test.
        return False

    clex = CLexer(errfoo, typelookup)
    clex.build()
    clex.input(text)

    while 1:
        tok = clex.token()
        if not tok:
            break
        print([tok.value, tok.type, tok.lineno, clex.filename, tok.lexpos])
Eli Bendersky3921e8e2010-05-21 09:05:39 +0300456
Eli Bendersky3b1b08d2012-06-15 12:37:54 +0300457