Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 1 | //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // This file implements the TokenConcatenation class. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "clang/Lex/TokenConcatenation.h" |
| 15 | #include "clang/Lex/Preprocessor.h" |
Abramo Bagnara | c4bf2b9 | 2010-12-22 08:23:18 +0000 | [diff] [blame] | 16 | #include "llvm/Support/ErrorHandling.h" |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 17 | using namespace clang; |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 18 | |
| 19 | |
Craig Topper | 2fa4e86 | 2011-08-11 04:06:15 +0000 | [diff] [blame] | 20 | /// IsStringPrefix - Return true if Str is a string prefix. |
| 21 | /// 'L', 'u', 'U', or 'u8'. Including raw versions. |
Craig Topper | 03720fc | 2011-08-11 05:10:55 +0000 | [diff] [blame] | 22 | static bool IsStringPrefix(StringRef Str, bool CPlusPlus0x) { |
Craig Topper | 2fa4e86 | 2011-08-11 04:06:15 +0000 | [diff] [blame] | 23 | |
| 24 | if (Str[0] == 'L' || |
| 25 | (CPlusPlus0x && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) { |
| 26 | |
| 27 | if (Str.size() == 1) |
| 28 | return true; // "L", "u", "U", and "R" |
| 29 | |
| 30 | // Check for raw flavors. Need to make sure the first character wasn't |
| 31 | // already R. Need CPlusPlus0x check for "LR". |
| 32 | if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus0x) |
| 33 | return true; // "LR", "uR", "UR" |
| 34 | |
| 35 | // Check for "u8" and "u8R" |
| 36 | if (Str[0] == 'u' && Str[1] == '8') { |
| 37 | if (Str.size() == 2) return true; // "u8" |
| 38 | if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R" |
| 39 | } |
| 40 | } |
| 41 | |
| 42 | return false; |
| 43 | } |
| 44 | |
Douglas Gregor | 5cee119 | 2011-07-27 05:40:30 +0000 | [diff] [blame] | 45 | /// IsIdentifierStringPrefix - Return true if the spelling of the token |
Craig Topper | 2fa4e86 | 2011-08-11 04:06:15 +0000 | [diff] [blame] | 46 | /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions. |
Douglas Gregor | 5cee119 | 2011-07-27 05:40:30 +0000 | [diff] [blame] | 47 | bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const { |
| 48 | const LangOptions &LangOpts = PP.getLangOptions(); |
| 49 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 50 | if (!Tok.needsCleaning()) { |
Craig Topper | 2fa4e86 | 2011-08-11 04:06:15 +0000 | [diff] [blame] | 51 | if (Tok.getLength() < 1 || Tok.getLength() > 3) |
Douglas Gregor | 5cee119 | 2011-07-27 05:40:30 +0000 | [diff] [blame] | 52 | return false; |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 53 | SourceManager &SM = PP.getSourceManager(); |
Douglas Gregor | 5cee119 | 2011-07-27 05:40:30 +0000 | [diff] [blame] | 54 | const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); |
Craig Topper | 2fa4e86 | 2011-08-11 04:06:15 +0000 | [diff] [blame] | 55 | return IsStringPrefix(StringRef(Ptr, Tok.getLength()), |
| 56 | LangOpts.CPlusPlus0x); |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 57 | } |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 58 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 59 | if (Tok.getLength() < 256) { |
| 60 | char Buffer[256]; |
| 61 | const char *TokPtr = Buffer; |
Douglas Gregor | 5cee119 | 2011-07-27 05:40:30 +0000 | [diff] [blame] | 62 | unsigned length = PP.getSpelling(Tok, TokPtr); |
Craig Topper | 2fa4e86 | 2011-08-11 04:06:15 +0000 | [diff] [blame] | 63 | return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus0x); |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 64 | } |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 65 | |
Craig Topper | 2fa4e86 | 2011-08-11 04:06:15 +0000 | [diff] [blame] | 66 | return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus0x); |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 67 | } |
| 68 | |
| 69 | TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) { |
| 70 | memset(TokenInfo, 0, sizeof(TokenInfo)); |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 71 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 72 | // These tokens have custom code in AvoidConcat. |
| 73 | TokenInfo[tok::identifier ] |= aci_custom; |
| 74 | TokenInfo[tok::numeric_constant] |= aci_custom_firstchar; |
| 75 | TokenInfo[tok::period ] |= aci_custom_firstchar; |
| 76 | TokenInfo[tok::amp ] |= aci_custom_firstchar; |
| 77 | TokenInfo[tok::plus ] |= aci_custom_firstchar; |
| 78 | TokenInfo[tok::minus ] |= aci_custom_firstchar; |
| 79 | TokenInfo[tok::slash ] |= aci_custom_firstchar; |
| 80 | TokenInfo[tok::less ] |= aci_custom_firstchar; |
| 81 | TokenInfo[tok::greater ] |= aci_custom_firstchar; |
| 82 | TokenInfo[tok::pipe ] |= aci_custom_firstchar; |
| 83 | TokenInfo[tok::percent ] |= aci_custom_firstchar; |
| 84 | TokenInfo[tok::colon ] |= aci_custom_firstchar; |
| 85 | TokenInfo[tok::hash ] |= aci_custom_firstchar; |
| 86 | TokenInfo[tok::arrow ] |= aci_custom_firstchar; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 87 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 88 | // These tokens change behavior if followed by an '='. |
| 89 | TokenInfo[tok::amp ] |= aci_avoid_equal; // &= |
| 90 | TokenInfo[tok::plus ] |= aci_avoid_equal; // += |
| 91 | TokenInfo[tok::minus ] |= aci_avoid_equal; // -= |
| 92 | TokenInfo[tok::slash ] |= aci_avoid_equal; // /= |
| 93 | TokenInfo[tok::less ] |= aci_avoid_equal; // <= |
| 94 | TokenInfo[tok::greater ] |= aci_avoid_equal; // >= |
| 95 | TokenInfo[tok::pipe ] |= aci_avoid_equal; // |= |
| 96 | TokenInfo[tok::percent ] |= aci_avoid_equal; // %= |
| 97 | TokenInfo[tok::star ] |= aci_avoid_equal; // *= |
| 98 | TokenInfo[tok::exclaim ] |= aci_avoid_equal; // != |
| 99 | TokenInfo[tok::lessless ] |= aci_avoid_equal; // <<= |
Chris Lattner | 8685110 | 2010-03-26 17:10:02 +0000 | [diff] [blame] | 100 | TokenInfo[tok::greatergreater] |= aci_avoid_equal; // >>= |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 101 | TokenInfo[tok::caret ] |= aci_avoid_equal; // ^= |
| 102 | TokenInfo[tok::equal ] |= aci_avoid_equal; // == |
| 103 | } |
| 104 | |
Daniel Dunbar | 99c7622 | 2009-03-18 03:32:24 +0000 | [diff] [blame] | 105 | /// GetFirstChar - Get the first character of the token \arg Tok, |
| 106 | /// avoiding calls to getSpelling where possible. |
| 107 | static char GetFirstChar(Preprocessor &PP, const Token &Tok) { |
| 108 | if (IdentifierInfo *II = Tok.getIdentifierInfo()) { |
| 109 | // Avoid spelling identifiers, the most common form of token. |
Daniel Dunbar | e013d68 | 2009-10-18 20:26:12 +0000 | [diff] [blame] | 110 | return II->getNameStart()[0]; |
Daniel Dunbar | 99c7622 | 2009-03-18 03:32:24 +0000 | [diff] [blame] | 111 | } else if (!Tok.needsCleaning()) { |
| 112 | if (Tok.isLiteral() && Tok.getLiteralData()) { |
| 113 | return *Tok.getLiteralData(); |
| 114 | } else { |
| 115 | SourceManager &SM = PP.getSourceManager(); |
| 116 | return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); |
| 117 | } |
| 118 | } else if (Tok.getLength() < 256) { |
| 119 | char Buffer[256]; |
| 120 | const char *TokPtr = Buffer; |
| 121 | PP.getSpelling(Tok, TokPtr); |
| 122 | return TokPtr[0]; |
| 123 | } else { |
| 124 | return PP.getSpelling(Tok)[0]; |
| 125 | } |
| 126 | } |
| 127 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 128 | /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause |
| 129 | /// the two individual tokens to be lexed as a single token, return true |
| 130 | /// (which causes a space to be printed between them). This allows the output |
| 131 | /// of -E mode to be lexed to the same token stream as lexing the input |
| 132 | /// directly would. |
| 133 | /// |
| 134 | /// This code must conservatively return true if it doesn't want to be 100% |
| 135 | /// accurate. This will cause the output to include extra space characters, |
| 136 | /// but the resulting output won't have incorrect concatenations going on. |
| 137 | /// Examples include "..", which we print with a space between, because we |
| 138 | /// don't want to track enough to tell "x.." from "...". |
Chris Lattner | 8877321 | 2010-04-14 03:57:19 +0000 | [diff] [blame] | 139 | bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, |
| 140 | const Token &PrevTok, |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 141 | const Token &Tok) const { |
Chris Lattner | e1614bb | 2009-04-21 23:28:41 +0000 | [diff] [blame] | 142 | // First, check to see if the tokens were directly adjacent in the original |
| 143 | // source. If they were, it must be okay to stick them together: if there |
| 144 | // were an issue, the tokens would have been lexed differently. |
| 145 | if (PrevTok.getLocation().isFileID() && Tok.getLocation().isFileID() && |
Argyrios Kyrtzidis | a64ccef | 2011-09-19 20:40:19 +0000 | [diff] [blame] | 146 | PrevTok.getLocation().getLocWithOffset(PrevTok.getLength()) == |
Chris Lattner | e1614bb | 2009-04-21 23:28:41 +0000 | [diff] [blame] | 147 | Tok.getLocation()) |
| 148 | return false; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 149 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 150 | tok::TokenKind PrevKind = PrevTok.getKind(); |
| 151 | if (PrevTok.getIdentifierInfo()) // Language keyword or named operator. |
| 152 | PrevKind = tok::identifier; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 153 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 154 | // Look up information on when we should avoid concatenation with prevtok. |
| 155 | unsigned ConcatInfo = TokenInfo[PrevKind]; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 156 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 157 | // If prevtok never causes a problem for anything after it, return quickly. |
| 158 | if (ConcatInfo == 0) return false; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 159 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 160 | if (ConcatInfo & aci_avoid_equal) { |
| 161 | // If the next token is '=' or '==', avoid concatenation. |
| 162 | if (Tok.is(tok::equal) || Tok.is(tok::equalequal)) |
| 163 | return true; |
| 164 | ConcatInfo &= ~aci_avoid_equal; |
| 165 | } |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 166 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 167 | if (ConcatInfo == 0) return false; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 168 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 169 | // Basic algorithm: we look at the first character of the second token, and |
| 170 | // determine whether it, if appended to the first token, would form (or |
| 171 | // would contribute) to a larger token if concatenated. |
| 172 | char FirstChar = 0; |
| 173 | if (ConcatInfo & aci_custom) { |
| 174 | // If the token does not need to know the first character, don't get it. |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 175 | } else { |
Daniel Dunbar | 99c7622 | 2009-03-18 03:32:24 +0000 | [diff] [blame] | 176 | FirstChar = GetFirstChar(PP, Tok); |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 177 | } |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 178 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 179 | switch (PrevKind) { |
Abramo Bagnara | c4bf2b9 | 2010-12-22 08:23:18 +0000 | [diff] [blame] | 180 | default: |
| 181 | llvm_unreachable("InitAvoidConcatTokenInfo built wrong"); |
Abramo Bagnara | c4bf2b9 | 2010-12-22 08:23:18 +0000 | [diff] [blame] | 182 | |
| 183 | case tok::raw_identifier: |
| 184 | llvm_unreachable("tok::raw_identifier in non-raw lexing mode!"); |
Abramo Bagnara | c4bf2b9 | 2010-12-22 08:23:18 +0000 | [diff] [blame] | 185 | |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 186 | case tok::identifier: // id+id or id+number or id+L"foo". |
Daniel Dunbar | 99c7622 | 2009-03-18 03:32:24 +0000 | [diff] [blame] | 187 | // id+'.'... will not append. |
| 188 | if (Tok.is(tok::numeric_constant)) |
| 189 | return GetFirstChar(PP, Tok) != '.'; |
| 190 | |
Douglas Gregor | 5cee119 | 2011-07-27 05:40:30 +0000 | [diff] [blame] | 191 | if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) || |
| 192 | Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) || |
| 193 | Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) || |
| 194 | Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant)) |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 195 | return true; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 196 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 197 | // If this isn't identifier + string, we're done. |
| 198 | if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal)) |
| 199 | return false; |
Mike Stump | 1eb4433 | 2009-09-09 15:08:12 +0000 | [diff] [blame] | 200 | |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 201 | // Otherwise, this is a narrow character or string. If the *identifier* |
Douglas Gregor | 5cee119 | 2011-07-27 05:40:30 +0000 | [diff] [blame] | 202 | // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo". |
| 203 | return IsIdentifierStringPrefix(PrevTok); |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 204 | case tok::numeric_constant: |
| 205 | return isalnum(FirstChar) || Tok.is(tok::numeric_constant) || |
| 206 | FirstChar == '+' || FirstChar == '-' || FirstChar == '.'; |
| 207 | case tok::period: // ..., .*, .1234 |
Chris Lattner | 8877321 | 2010-04-14 03:57:19 +0000 | [diff] [blame] | 208 | return (FirstChar == '.' && PrevPrevTok.is(tok::period)) || |
| 209 | isdigit(FirstChar) || |
Eli Friedman | 8849f11 | 2009-06-15 19:48:50 +0000 | [diff] [blame] | 210 | (PP.getLangOptions().CPlusPlus && FirstChar == '*'); |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 211 | case tok::amp: // && |
| 212 | return FirstChar == '&'; |
| 213 | case tok::plus: // ++ |
| 214 | return FirstChar == '+'; |
| 215 | case tok::minus: // --, ->, ->* |
| 216 | return FirstChar == '-' || FirstChar == '>'; |
| 217 | case tok::slash: //, /*, // |
| 218 | return FirstChar == '*' || FirstChar == '/'; |
| 219 | case tok::less: // <<, <<=, <:, <% |
| 220 | return FirstChar == '<' || FirstChar == ':' || FirstChar == '%'; |
| 221 | case tok::greater: // >>, >>= |
| 222 | return FirstChar == '>'; |
| 223 | case tok::pipe: // || |
| 224 | return FirstChar == '|'; |
| 225 | case tok::percent: // %>, %: |
Eli Friedman | 896ccf8 | 2009-05-27 22:33:06 +0000 | [diff] [blame] | 226 | return FirstChar == '>' || FirstChar == ':'; |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 227 | case tok::colon: // ::, :> |
Eli Friedman | 8849f11 | 2009-06-15 19:48:50 +0000 | [diff] [blame] | 228 | return FirstChar == '>' || |
| 229 | (PP.getLangOptions().CPlusPlus && FirstChar == ':'); |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 230 | case tok::hash: // ##, #@, %:%: |
| 231 | return FirstChar == '#' || FirstChar == '@' || FirstChar == '%'; |
| 232 | case tok::arrow: // ->* |
Eli Friedman | 8849f11 | 2009-06-15 19:48:50 +0000 | [diff] [blame] | 233 | return PP.getLangOptions().CPlusPlus && FirstChar == '*'; |
Chris Lattner | d7038e1 | 2009-02-13 00:46:04 +0000 | [diff] [blame] | 234 | } |
| 235 | } |