#------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Copyright (C) 2008-2013, Eli Bendersky
# License: BSD
#------------------------------------------------------------------------------
import re
import sys

from .ply import lex
from .ply.lex import TOKEN
Eli Bendersky | 3921e8e | 2010-05-21 09:05:39 +0300 | [diff] [blame] | 14 | |
| 15 | |
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        # Raw strings: \W and \d are regex escapes, not string escapes, and
        # a non-raw literal triggers "invalid escape sequence" warnings.
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__

            kwargs are forwarded unchanged to lex.lex.
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Sets the text that subsequent token() calls will lex.
        """
        self.lexer.input(text)

    def token(self):
        """ Returns the next token from the underlying PLY lexer and
            remembers it in self.last_token.
        """
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.

            The column is the distance from the last newline that
            precedes the token. rfind returns -1 when there is no
            such newline, so columns on the first line are 1-based.
        """
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        """ Reports msg through error_func together with the
            (line, column) location of token, then skips one
            character of input.
        """
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        """ Returns the (line, column) pair locating the token.
        """
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        '_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG', 'REGISTER',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE',
    )

    # Maps the spelling of each keyword as it appears in C source to its
    # token type. All keywords are lower-case in C, except _Bool and
    # _Complex which are mixed-case.
    keyword_map = {}
    for keyword in keywords:
        if keyword == '_BOOL':
            keyword_map['_Bool'] = keyword
        elif keyword == '_COMPLEX':
            keyword_map['_Complex'] = keyword
        else:
            keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # . ,
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',      # '#'
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    decimal_escape = r"""(\d+)"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    # bad_escape must exclude every character that a valid escape can start
    # with. decimal_escape accepts any digit, so the excluded range is 0-9;
    # excluding only 0-7 made string literals containing \8 or \9 (e.g. a
    # Windows path like "dir\8file") be reported as invalid.
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    # A '#' followed by "line"/a number switches to the ppline state, one
    # followed by "pragma" switches to the pppragma state (no token is
    # returned in either case); any other '#' is returned as PPHASH.
    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            # string_literal always matches exactly one delimiting quote on
            # each side; slice them off instead of strip()-ing, which would
            # also eat an escaped quote at either end of the filename.
            self.pp_filename = t.value[1:-1]

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    # End of the directive: apply the collected line number (and file name,
    # if one was given) and return to the normal state.
    def t_ppline_NEWLINE(self, t):
        r'\n'

        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        pass

    # Pragma contents are discarded. '=' is part of the ignored set so that
    # directives such as "#pragma weak a = b" are skipped rather than
    # reported as invalid.
    t_pppragma_ignore = ' \t<>.-{}();=+-*/$%@&^~!?:,0123456789'

    @TOKEN(string_literal)
    def t_pppragma_STR(self, t): pass

    @TOKEN(identifier)
    def t_pppragma_ID(self, t): pass

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter.  The trouble begins with yacc's
    # lookahead token.  If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID.  So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t

    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    # Identifiers are classified in two steps: reserved words take their
    # keyword token type; any other name is a TYPEID when type_lookup_func
    # reports it as a typedef'd name, and a plain ID otherwise.
    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
| 477 | |