#-----------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Copyright (C) 2008-2010, Eli Bendersky
# License: LGPL
#-----------------------------------------------------------------

import re
import sys

import ply.lex
from ply.lex import TOKEN


class CLexer(object):
18 """ A lexer for the C language. After building it, set the
19 input text with input(), and call token() to get new
20 tokens.
21
22 The public attribute filename can be set to an initial
23 filaneme, but the lexer will update it upon #line
24 directives.
25 """
    def __init__(self, error_func, type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
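
        # After t_PPHASH consumes the '#', this pattern recognizes the
        # rest of either directive form, e.g.:
        #
        #   #line 66 "file.h"       (the standard #line directive)
        #   # 66 "file.h"           (GCC's cpp output)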

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = ply.lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        self.lexer.input(text)

    def token(self):
        return self.lexer.token()

    ######################-- PRIVATE --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _find_tok_column(self, token):
        # Scan backwards from the token's position to the nearest
        # preceding newline (or the start of the input), and derive
        # the column from the offset.
        i = token.lexpos
        while i > 0:
            if self.lexer.lexdata[i] == '\n':
                break
            i -= 1
        return (token.lexpos - i) + 1

    def _make_tok_location(self, token):
        return (token.lineno, self._find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE',
        'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INT', 'LONG', 'REGISTER',
        'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map each keyword to its token type, e.g. 'while' -> 'WHILE'
    keyword_map = {}
    for r in keywords:
        keyword_map[r.lower()] = r

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # Constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid C identifiers (K&R2: A.2.3)
    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU][lL])|([lL][uU])|[uU]|[lL])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = '0[xX][0-9a-fA-F]+'+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'
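
    # A rough sketch of what these patterns accept (suffixes as in
    # integer_suffix_opt):
    #
    #   decimal: 42, 42u, 42UL        octal: 052, 052L
    #   hex: 0x2f, 0X2FuL             bad octal (lexer error): 078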

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z are allowed as escape chars to support #line
    # directives with Windows paths as filenames (\dir\file...)
    #
    simple_escape = r"""([a-zA-Z\\?'"])"""
    octal_escape = r"""([0-7]{1,3})"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+octal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
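
    # For example, char_const accepts 'a', '\n', '\061' and '\x41',
    # while bad_char_const is meant to catch forms like 'ab', '' and '\9'.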

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
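
    # e.g. floating_constant accepts 1.5, .5, 3., 1e10 and 1.5e-3f;
    # a lone 15 has neither a point nor an exponent, so it lexes as an
    # integer constant instead.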

    ##
    ## Lexer states
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),
    )
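
    # Rough flow of #line handling: t_PPHASH sees the '#' and switches
    # the lexer into 'ppline'; t_ppline_LINE_NUMBER and t_ppline_FILENAME
    # then collect the number and (optional) filename, and
    # t_ppline_NEWLINE commits them to lexer.lineno and self.filename.
    # For example, after:
    #
    #   #line 66 "kwas\df.h"
    #
    # subsequent tokens report line numbers from 66 on, with filename
    # "kwas\df.h".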

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        m = self.line_pattern.match(
            t.lexer.lexdata, pos=t.lexer.lexpos)

        if m:
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
            #~ print "ppline starts on line %s" % t.lexer.lineno
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')
            #~ print "PP got filename: ", self.pp_filename

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'

        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        msg = 'invalid #line directive'
        self._error(msg, t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_LBRACE            = r'\{'
    t_RBRACE            = r'\}'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    t_STRING_LITERAL    = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # which would mis-lex hex and octal constants)
    #
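    # (for instance, if decimal_constant were tried first, "0x17" could
    # lex as the decimal constant "0" followed by the identifier "x17")
    #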
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")

        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"

        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)


if __name__ == "__main__":
    from portability import printme
    #~ filename = '../zp.c'
    #~ text = open(filename).read()

    #~ text = '"'+r"""ka \p ka"""+'"'
    text = r"""
    546
    #line 66 "kwas\df.h"
    id 4
    # 5
    dsf
    """

    def errfoo(msg, a, b):
        printme(msg)
        sys.exit()

    def typelookup(name):
        return False

    clex = CLexer(errfoo, typelookup)
    clex.build()
    clex.input(text)

    while 1:
        tok = clex.token()
        if not tok:
            break

        #~ print type(tok)
        printme([tok.value, tok.type, tok.lineno, clex.filename, tok.lexpos])