Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 1 | //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // This file implements the TokenConcatenation class. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "clang/Lex/TokenConcatenation.h" |
| 15 | #include "clang/Lex/Preprocessor.h" |
| 16 | using namespace clang; |
| 17 | |
| 18 | |
| 19 | /// StartsWithL - Return true if the spelling of this token starts with 'L'. |
| 20 | bool TokenConcatenation::StartsWithL(const Token &Tok) const { |
| 21 | if (!Tok.needsCleaning()) { |
| 22 | SourceManager &SM = PP.getSourceManager(); |
| 23 | return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L'; |
| 24 | } |
| 25 | |
| 26 | if (Tok.getLength() < 256) { |
| 27 | char Buffer[256]; |
| 28 | const char *TokPtr = Buffer; |
| 29 | PP.getSpelling(Tok, TokPtr); |
| 30 | return TokPtr[0] == 'L'; |
| 31 | } |
| 32 | |
| 33 | return PP.getSpelling(Tok)[0] == 'L'; |
| 34 | } |
| 35 | |
| 36 | /// IsIdentifierL - Return true if the spelling of this token is literally |
| 37 | /// 'L'. |
| 38 | bool TokenConcatenation::IsIdentifierL(const Token &Tok) const { |
| 39 | if (!Tok.needsCleaning()) { |
| 40 | if (Tok.getLength() != 1) |
| 41 | return false; |
| 42 | SourceManager &SM = PP.getSourceManager(); |
| 43 | return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L'; |
| 44 | } |
| 45 | |
| 46 | if (Tok.getLength() < 256) { |
| 47 | char Buffer[256]; |
| 48 | const char *TokPtr = Buffer; |
| 49 | if (PP.getSpelling(Tok, TokPtr) != 1) |
| 50 | return false; |
| 51 | return TokPtr[0] == 'L'; |
| 52 | } |
| 53 | |
| 54 | return PP.getSpelling(Tok) == "L"; |
| 55 | } |
| 56 | |
| 57 | TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) { |
| 58 | memset(TokenInfo, 0, sizeof(TokenInfo)); |
| 59 | |
| 60 | // These tokens have custom code in AvoidConcat. |
| 61 | TokenInfo[tok::identifier ] |= aci_custom; |
| 62 | TokenInfo[tok::numeric_constant] |= aci_custom_firstchar; |
| 63 | TokenInfo[tok::period ] |= aci_custom_firstchar; |
| 64 | TokenInfo[tok::amp ] |= aci_custom_firstchar; |
| 65 | TokenInfo[tok::plus ] |= aci_custom_firstchar; |
| 66 | TokenInfo[tok::minus ] |= aci_custom_firstchar; |
| 67 | TokenInfo[tok::slash ] |= aci_custom_firstchar; |
| 68 | TokenInfo[tok::less ] |= aci_custom_firstchar; |
| 69 | TokenInfo[tok::greater ] |= aci_custom_firstchar; |
| 70 | TokenInfo[tok::pipe ] |= aci_custom_firstchar; |
| 71 | TokenInfo[tok::percent ] |= aci_custom_firstchar; |
| 72 | TokenInfo[tok::colon ] |= aci_custom_firstchar; |
| 73 | TokenInfo[tok::hash ] |= aci_custom_firstchar; |
| 74 | TokenInfo[tok::arrow ] |= aci_custom_firstchar; |
| 75 | |
| 76 | // These tokens change behavior if followed by an '='. |
| 77 | TokenInfo[tok::amp ] |= aci_avoid_equal; // &= |
| 78 | TokenInfo[tok::plus ] |= aci_avoid_equal; // += |
| 79 | TokenInfo[tok::minus ] |= aci_avoid_equal; // -= |
| 80 | TokenInfo[tok::slash ] |= aci_avoid_equal; // /= |
| 81 | TokenInfo[tok::less ] |= aci_avoid_equal; // <= |
| 82 | TokenInfo[tok::greater ] |= aci_avoid_equal; // >= |
| 83 | TokenInfo[tok::pipe ] |= aci_avoid_equal; // |= |
| 84 | TokenInfo[tok::percent ] |= aci_avoid_equal; // %= |
| 85 | TokenInfo[tok::star ] |= aci_avoid_equal; // *= |
| 86 | TokenInfo[tok::exclaim ] |= aci_avoid_equal; // != |
| 87 | TokenInfo[tok::lessless ] |= aci_avoid_equal; // <<= |
| 88 | TokenInfo[tok::greaterequal] |= aci_avoid_equal; // >>= |
| 89 | TokenInfo[tok::caret ] |= aci_avoid_equal; // ^= |
| 90 | TokenInfo[tok::equal ] |= aci_avoid_equal; // == |
| 91 | } |
| 92 | |
| 93 | /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause |
| 94 | /// the two individual tokens to be lexed as a single token, return true |
| 95 | /// (which causes a space to be printed between them). This allows the output |
| 96 | /// of -E mode to be lexed to the same token stream as lexing the input |
| 97 | /// directly would. |
| 98 | /// |
| 99 | /// This code must conservatively return true if it doesn't want to be 100% |
| 100 | /// accurate. This will cause the output to include extra space characters, |
| 101 | /// but the resulting output won't have incorrect concatenations going on. |
| 102 | /// Examples include "..", which we print with a space between, because we |
| 103 | /// don't want to track enough to tell "x.." from "...". |
| 104 | bool TokenConcatenation::AvoidConcat(const Token &PrevTok, |
| 105 | const Token &Tok) const { |
| 106 | char Buffer[256]; |
| 107 | |
| 108 | tok::TokenKind PrevKind = PrevTok.getKind(); |
| 109 | if (PrevTok.getIdentifierInfo()) // Language keyword or named operator. |
| 110 | PrevKind = tok::identifier; |
| 111 | |
| 112 | // Look up information on when we should avoid concatenation with prevtok. |
| 113 | unsigned ConcatInfo = TokenInfo[PrevKind]; |
| 114 | |
| 115 | // If prevtok never causes a problem for anything after it, return quickly. |
| 116 | if (ConcatInfo == 0) return false; |
| 117 | |
| 118 | if (ConcatInfo & aci_avoid_equal) { |
| 119 | // If the next token is '=' or '==', avoid concatenation. |
| 120 | if (Tok.is(tok::equal) || Tok.is(tok::equalequal)) |
| 121 | return true; |
| 122 | ConcatInfo &= ~aci_avoid_equal; |
| 123 | } |
| 124 | |
| 125 | if (ConcatInfo == 0) return false; |
| 126 | |
| 127 | // Basic algorithm: we look at the first character of the second token, and |
| 128 | // determine whether it, if appended to the first token, would form (or |
| 129 | // would contribute) to a larger token if concatenated. |
| 130 | char FirstChar = 0; |
| 131 | if (ConcatInfo & aci_custom) { |
| 132 | // If the token does not need to know the first character, don't get it. |
| 133 | } else if (IdentifierInfo *II = Tok.getIdentifierInfo()) { |
| 134 | // Avoid spelling identifiers, the most common form of token. |
| 135 | FirstChar = II->getName()[0]; |
| 136 | } else if (!Tok.needsCleaning()) { |
| 137 | if (Tok.isLiteral() && Tok.getLiteralData()) { |
| 138 | FirstChar = *Tok.getLiteralData(); |
| 139 | } else { |
| 140 | SourceManager &SrcMgr = PP.getSourceManager(); |
| 141 | FirstChar = |
| 142 | *SrcMgr.getCharacterData(SrcMgr.getSpellingLoc(Tok.getLocation())); |
| 143 | } |
| 144 | } else if (Tok.getLength() < 256) { |
| 145 | const char *TokPtr = Buffer; |
| 146 | PP.getSpelling(Tok, TokPtr); |
| 147 | FirstChar = TokPtr[0]; |
| 148 | } else { |
| 149 | FirstChar = PP.getSpelling(Tok)[0]; |
| 150 | } |
| 151 | |
| 152 | switch (PrevKind) { |
| 153 | default: assert(0 && "InitAvoidConcatTokenInfo built wrong"); |
| 154 | case tok::identifier: // id+id or id+number or id+L"foo". |
| 155 | if (Tok.is(tok::numeric_constant) || Tok.getIdentifierInfo() || |
| 156 | Tok.is(tok::wide_string_literal) /* || |
| 157 | Tok.is(tok::wide_char_literal)*/) |
| 158 | return true; |
| 159 | |
| 160 | // If this isn't identifier + string, we're done. |
| 161 | if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal)) |
| 162 | return false; |
| 163 | |
| 164 | // FIXME: need a wide_char_constant! |
| 165 | |
| 166 | // If the string was a wide string L"foo" or wide char L'f', it would |
| 167 | // concat with the previous identifier into fooL"bar". Avoid this. |
| 168 | if (StartsWithL(Tok)) |
| 169 | return true; |
| 170 | |
| 171 | // Otherwise, this is a narrow character or string. If the *identifier* |
| 172 | // is a literal 'L', avoid pasting L "foo" -> L"foo". |
| 173 | return IsIdentifierL(PrevTok); |
| 174 | case tok::numeric_constant: |
| 175 | return isalnum(FirstChar) || Tok.is(tok::numeric_constant) || |
| 176 | FirstChar == '+' || FirstChar == '-' || FirstChar == '.'; |
| 177 | case tok::period: // ..., .*, .1234 |
| 178 | return FirstChar == '.' || isdigit(FirstChar) || |
| 179 | (FirstChar == '*' && PP.getLangOptions().CPlusPlus); |
| 180 | case tok::amp: // && |
| 181 | return FirstChar == '&'; |
| 182 | case tok::plus: // ++ |
| 183 | return FirstChar == '+'; |
| 184 | case tok::minus: // --, ->, ->* |
| 185 | return FirstChar == '-' || FirstChar == '>'; |
| 186 | case tok::slash: //, /*, // |
| 187 | return FirstChar == '*' || FirstChar == '/'; |
| 188 | case tok::less: // <<, <<=, <:, <% |
| 189 | return FirstChar == '<' || FirstChar == ':' || FirstChar == '%'; |
| 190 | case tok::greater: // >>, >>= |
| 191 | return FirstChar == '>'; |
| 192 | case tok::pipe: // || |
| 193 | return FirstChar == '|'; |
| 194 | case tok::percent: // %>, %: |
| 195 | return (FirstChar == '>' || FirstChar == ':') && |
| 196 | PP.getLangOptions().Digraphs; |
| 197 | case tok::colon: // ::, :> |
| 198 | return (FirstChar == ':' && PP.getLangOptions().CPlusPlus) || |
| 199 | (FirstChar == '>' && PP.getLangOptions().Digraphs); |
| 200 | case tok::hash: // ##, #@, %:%: |
| 201 | return FirstChar == '#' || FirstChar == '@' || FirstChar == '%'; |
| 202 | case tok::arrow: // ->* |
| 203 | return FirstChar == '*'; |
| 204 | } |
| 205 | } |