Eli Bendersky | 3921e8e | 2010-05-21 09:05:39 +0300 | [diff] [blame] | 1 | #----------------------------------------------------------------- |
| 2 | # pycparser: c_lexer.py |
| 3 | # |
| 4 | # CLexer class: lexer for the C language |
| 5 | # |
eli.bendersky | 1a1e46b | 2011-02-18 15:32:18 +0200 | [diff] [blame] | 6 | # Copyright (C) 2008-2011, Eli Bendersky |
eli.bendersky | 84a6a63 | 2011-04-29 09:00:43 +0300 | [diff] [blame] | 7 | # License: BSD |
Eli Bendersky | 3921e8e | 2010-05-21 09:05:39 +0300 | [diff] [blame] | 8 | #----------------------------------------------------------------- |
| 9 | |
| 10 | import re |
| 11 | import sys |
| 12 | |
| 13 | import ply.lex |
| 14 | from ply.lex import TOKEN |
| 15 | |
| 16 | |
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        # NOTE: raw string - the pattern contains the regex escapes
        # \W and \d, which are invalid escapes in a plain str literal.
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__
        """
        self.lexer = ply.lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Set the input text to be tokenized. """
        self.lexer.input(text)

    def token(self):
        """ Return the next token, or None when input is exhausted. """
        g = self.lexer.token()
        return g

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report an error through the user-supplied callback and skip
        # one character so lexing can resume.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _find_tok_column(self, token):
        # Compute the (1-based) column of the token by scanning back
        # to the most recent newline in the lexer's input buffer.
        i = token.lexpos
        while i > 0:
            if self.lexer.lexdata[i] == '\n': break
            i -= 1
        return (token.lexpos - i) + 1

    def _make_tok_location(self, token):
        # (line, column) pair for error reporting.
        return (token.lineno, self._find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', '_BOOL', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE',
        'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG', 'REGISTER',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Map from source spelling to token type; '_Bool' keeps its C99
    # capitalization, all other keywords are simply lowercased.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimeters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3)
    identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(u?ll|U?LL|([uU][lL])|([lL][uU])|[uU]|[lL])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = '0[xX][0-9a-fA-F]+'+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    #
    simple_escape = r"""([a-zA-Z.\\?'"])"""
    octal_escape = r"""([0-7]{1,3})"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z.\\?'"x0-7])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+octal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'

    ##
    ## Lexer states
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),
    )

    def t_PPHASH(self, t):
        r'[ \t]*\#'
        # A '#' at the start of a line either introduces a #line
        # directive (switch to the ppline state) or is passed through
        # to the parser as a PPHASH token.
        m = self.line_pattern.match(
            t.lexer.lexdata, pos=t.lexer.lexpos)

        if m:
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
            #~ print "ppline starts on line %s" % t.lexer.lineno
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        # The filename must follow the line number in a #line directive.
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')
            #~ print "PP got filename: ", self.pp_filename

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    def t_ppline_NEWLINE(self, t):
        r'\n'
        # End of the #line directive: commit the collected line number
        # (and filename, if any) and return to normal lexing.
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        # The literal 'line' keyword in "# line ..." carries no payload.
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        msg = 'invalid #line directive'
        self._error(msg, t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimeters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_LBRACE            = r'\{'
    t_RBRACE            = r'\}'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    t_STRING_LITERAL    = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # An identifier may turn out to be a keyword or (via the
        # user-supplied lookup) a typedef'd type name.
        t.type = self.keyword_map.get(t.value, "ID")

        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"

        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
| 413 | |
| 414 | |
if __name__ == "__main__":
    # Ad-hoc smoke test: lex a small snippet exercising #line directives
    # (both "#line N file" and GCC's "# N" form) and dump each token.
    text = r"""
    546
    #line 66 "kwas\df.h"
    id 4
    # 5
    dsf
    """

    def errfoo(msg, a, b):
        # Error callback for CLexer: report the message and abort.
        # (was: sys.write(...) - sys has no 'write' attribute)
        sys.stderr.write(msg + "\n")
        sys.exit()

    def typelookup(name):
        # No typedefs exist in this standalone test, so nothing is a TYPEID.
        return False

    clex = CLexer(errfoo, typelookup)
    clex.build()
    clex.input(text)

    while True:
        tok = clex.token()
        if not tok:
            break
        # (was: printme(...) - an undefined name)
        print([tok.value, tok.type, tok.lineno, clex.filename, tok.lexpos])