blob: 235f8f03814b30c7fc8451c198e1e9124f313f33 [file] [log] [blame]
Eli Bendersky3921e8e2010-05-21 09:05:39 +03001#-----------------------------------------------------------------
2# pycparser: c_lexer.py
3#
4# CLexer class: lexer for the C language
5#
eli.bendersky1a1e46b2011-02-18 15:32:18 +02006# Copyright (C) 2008-2011, Eli Bendersky
eli.bendersky84a6a632011-04-29 09:00:43 +03007# License: BSD
Eli Bendersky3921e8e2010-05-21 09:05:39 +03008#-----------------------------------------------------------------
9
10import re
11import sys
12
13import ply.lex
14from ply.lex import TOKEN
15
16
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        # NOTE: this must be a raw string -- '\W' and '\d' are not valid
        # escape sequences in a plain string literal, which triggers a
        # DeprecationWarning (and eventually a SyntaxError) on modern
        # Pythons. The compiled pattern is unchanged.
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = ply.lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Feeds *text* to the underlying PLY lexer. """
        self.lexer.input(text)

    def token(self):
        """ Returns the next token, or None when input is exhausted. """
        g = self.lexer.token()
        return g

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report an error through the user-supplied callback and skip
        # one character so lexing can continue.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _find_tok_column(self, token):
        """ Find the 1-based column of *token* within its line. """
        # Fix: the previous hand-rolled backward scan was off by one --
        # it reported column 2 for a token immediately after a newline
        # and miscounted tokens on the very first line. rfind() returns
        # -1 when there is no preceding newline, which makes the
        # subtraction come out right for the first line as well.
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    def _make_tok_location(self, token):
        # (line, column) pair used by error reporting.
        return (token.lineno, self._find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', '_BOOL', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE',
        'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG', 'REGISTER',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map source-text spelling to token name. '_Bool' is the only
    # keyword whose spelling is not simply the lowercased token name.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3)
    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(u?ll|U?LL|([uU][lL])|([lL][uU])|[uU]|[lL])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = '0[xX][0-9a-fA-F]+'+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    #
    simple_escape = r"""([a-zA-Z.\\?'"])"""
    octal_escape = r"""([0-7]{1,3})"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z.\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+octal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'

    ##
    ## Lexer states
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),
    )

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        # A '#' either starts a #line directive (switch to the ppline
        # state) or is passed through as a PPHASH token.
        m = self.line_pattern.match(
            t.lexer.lexdata, pos=t.lexer.lexpos)

        if m:
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
            #~ print "ppline starts on line %s" % t.lexer.lineno
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        # The filename must follow the line number in a #line directive.
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')
            #~ print "PP got filename: ", self.pp_filename

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        # End of the directive: commit the collected line number and
        # filename, then return to normal lexing.
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        # The literal 'line' keyword of a "#line" directive -- consumed.
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        msg = 'invalid #line directive'
        self._error(msg, t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_LBRACE            = r'\{'
    t_RBRACE            = r'\}'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    t_STRING_LITERAL    = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # An identifier is either a keyword, a typedef'd type name
        # (per the user-supplied lookup), or a plain ID.
        t.type = self.keyword_map.get(t.value, "ID")

        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"

        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
413
414
if __name__ == "__main__":
    # Simple self-test: lex a snippet that exercises #line directives,
    # both the full "#line <num> <file>" form and GCC's bare "# <num>".
    #
    # Fix: the old code first read a hard-coded '../zp.c' into `text`
    # and then immediately overwrote `text` with this literal -- the
    # file read was dead code and crashed whenever the file was absent,
    # so it has been removed.
    text = r"""
    546
        #line 66 "kwas\df.h"
        id 4
        # 5
        dsf
    """

    def errfoo(msg, a, b):
        # Error callback: print the message and stop.
        # Fix: the module `sys` has no write() attribute -- the old
        # `sys.write(...)` raised AttributeError on the first lexer
        # error instead of reporting it.
        sys.stdout.write(msg + "\n")
        sys.exit()

    def typelookup(name):
        # No typedef'd names exist in this self-test.
        return False

    clex = CLexer(errfoo, typelookup)
    clex.build()
    clex.input(text)

    while 1:
        tok = clex.token()
        if not tok: break

        # Fix: `printme` was an undefined name, crashing on the first
        # token; use the builtin print instead.
        print([tok.value, tok.type, tok.lineno, clex.filename, tok.lexpos])