blob: 80e025def7b3e9319deecf6b21dd90df72e291d7 [file] [log] [blame]
Chris Lattner2f5add62007-04-05 06:57:15 +00001//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
Steve Naroff09ef4742007-03-09 23:16:33 +00002//
3// The LLVM Compiler Infrastructure
4//
Chris Lattner5b12ab82007-12-29 19:59:25 +00005// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
Steve Naroff09ef4742007-03-09 23:16:33 +00007//
8//===----------------------------------------------------------------------===//
9//
Chris Lattner2f5add62007-04-05 06:57:15 +000010// This file implements the NumericLiteralParser, CharLiteralParser, and
11// StringLiteralParser interfaces.
Steve Naroff09ef4742007-03-09 23:16:33 +000012//
13//===----------------------------------------------------------------------===//
14
15#include "clang/Lex/LiteralSupport.h"
Jordan Rosea7d03842013-02-08 22:30:41 +000016#include "clang/Basic/CharInfo.h"
Chandler Carruth3a022472012-12-04 09:13:33 +000017#include "clang/Basic/TargetInfo.h"
18#include "clang/Lex/LexDiagnostic.h"
19#include "clang/Lex/Preprocessor.h"
Steve Naroff4f88b312007-03-13 22:37:02 +000020#include "llvm/ADT/StringExtras.h"
Dmitri Gribenko9feeef42013-01-30 12:06:08 +000021#include "llvm/Support/ConvertUTF.h"
David Blaikie76bd3c82011-09-23 05:35:21 +000022#include "llvm/Support/ErrorHandling.h"
Dmitri Gribenko9feeef42013-01-30 12:06:08 +000023
Steve Naroff09ef4742007-03-09 23:16:33 +000024using namespace clang;
25
Douglas Gregorfb65e592011-07-27 05:40:30 +000026static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
27 switch (kind) {
David Blaikie83d382b2011-09-23 05:06:16 +000028 default: llvm_unreachable("Unknown token type!");
Douglas Gregorfb65e592011-07-27 05:40:30 +000029 case tok::char_constant:
30 case tok::string_literal:
31 case tok::utf8_string_literal:
32 return Target.getCharWidth();
33 case tok::wide_char_constant:
34 case tok::wide_string_literal:
35 return Target.getWCharWidth();
36 case tok::utf16_char_constant:
37 case tok::utf16_string_literal:
38 return Target.getChar16Width();
39 case tok::utf32_char_constant:
40 case tok::utf32_string_literal:
41 return Target.getChar32Width();
42 }
43}
44
Seth Cantrell4cfc8172012-10-28 18:24:46 +000045static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
46 FullSourceLoc TokLoc,
47 const char *TokBegin,
48 const char *TokRangeBegin,
49 const char *TokRangeEnd) {
50 SourceLocation Begin =
51 Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
52 TokLoc.getManager(), Features);
53 SourceLocation End =
54 Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
55 TokLoc.getManager(), Features);
56 return CharSourceRange::getCharRange(Begin, End);
57}
58
Richard Smith639b8d02012-09-08 07:16:20 +000059/// \brief Produce a diagnostic highlighting some portion of a literal.
60///
61/// Emits the diagnostic \p DiagID, highlighting the range of characters from
62/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
63/// a substring of a spelling buffer for the token beginning at \p TokBegin.
64static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
65 const LangOptions &Features, FullSourceLoc TokLoc,
66 const char *TokBegin, const char *TokRangeBegin,
67 const char *TokRangeEnd, unsigned DiagID) {
68 SourceLocation Begin =
69 Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
70 TokLoc.getManager(), Features);
Seth Cantrell4cfc8172012-10-28 18:24:46 +000071 return Diags->Report(Begin, DiagID) <<
72 MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
Richard Smith639b8d02012-09-08 07:16:20 +000073}
74
Chris Lattner2f5add62007-04-05 06:57:15 +000075/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
76/// either a character or a string literal.
Richard Smith639b8d02012-09-08 07:16:20 +000077static unsigned ProcessCharEscape(const char *ThisTokBegin,
78 const char *&ThisTokBuf,
Chris Lattner2f5add62007-04-05 06:57:15 +000079 const char *ThisTokEnd, bool &HadError,
Douglas Gregorfb65e592011-07-27 05:40:30 +000080 FullSourceLoc Loc, unsigned CharWidth,
Richard Smith639b8d02012-09-08 07:16:20 +000081 DiagnosticsEngine *Diags,
82 const LangOptions &Features) {
83 const char *EscapeBegin = ThisTokBuf;
84
Chris Lattner2f5add62007-04-05 06:57:15 +000085 // Skip the '\' char.
86 ++ThisTokBuf;
87
88 // We know that this character can't be off the end of the buffer, because
89 // that would have been \", which would not have been the end of string.
90 unsigned ResultChar = *ThisTokBuf++;
91 switch (ResultChar) {
92 // These map to themselves.
93 case '\\': case '\'': case '"': case '?': break;
Mike Stump11289f42009-09-09 15:08:12 +000094
Chris Lattner2f5add62007-04-05 06:57:15 +000095 // These have fixed mappings.
96 case 'a':
97 // TODO: K&R: the meaning of '\\a' is different in traditional C
98 ResultChar = 7;
99 break;
100 case 'b':
101 ResultChar = 8;
102 break;
103 case 'e':
Chris Lattner7a02bfd2010-11-17 06:26:08 +0000104 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +0000105 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
106 diag::ext_nonstandard_escape) << "e";
Chris Lattner2f5add62007-04-05 06:57:15 +0000107 ResultChar = 27;
108 break;
Eli Friedman28a00aa2009-06-10 01:32:39 +0000109 case 'E':
Chris Lattner7a02bfd2010-11-17 06:26:08 +0000110 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +0000111 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
112 diag::ext_nonstandard_escape) << "E";
Eli Friedman28a00aa2009-06-10 01:32:39 +0000113 ResultChar = 27;
114 break;
Chris Lattner2f5add62007-04-05 06:57:15 +0000115 case 'f':
116 ResultChar = 12;
117 break;
118 case 'n':
119 ResultChar = 10;
120 break;
121 case 'r':
122 ResultChar = 13;
123 break;
124 case 't':
125 ResultChar = 9;
126 break;
127 case 'v':
128 ResultChar = 11;
129 break;
Chris Lattnerc10adde2007-05-20 05:00:58 +0000130 case 'x': { // Hex escape.
131 ResultChar = 0;
Jordan Rosea7d03842013-02-08 22:30:41 +0000132 if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
Chris Lattner7a02bfd2010-11-17 06:26:08 +0000133 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +0000134 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
Jordan Roseaa89cf12013-01-24 20:50:13 +0000135 diag::err_hex_escape_no_digits) << "x";
Chris Lattner2f5add62007-04-05 06:57:15 +0000136 HadError = 1;
Chris Lattner2f5add62007-04-05 06:57:15 +0000137 break;
138 }
Mike Stump11289f42009-09-09 15:08:12 +0000139
Chris Lattner812eda82007-05-20 05:17:04 +0000140 // Hex escapes are a maximal series of hex digits.
Chris Lattnerc10adde2007-05-20 05:00:58 +0000141 bool Overflow = false;
142 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
Jordan Rose78ed86a2013-01-18 22:33:58 +0000143 int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
Chris Lattnerc10adde2007-05-20 05:00:58 +0000144 if (CharVal == -1) break;
Chris Lattner59f09b62008-09-30 20:45:40 +0000145 // About to shift out a digit?
146 Overflow |= (ResultChar & 0xF0000000) ? true : false;
Chris Lattnerc10adde2007-05-20 05:00:58 +0000147 ResultChar <<= 4;
148 ResultChar |= CharVal;
149 }
150
151 // See if any bits will be truncated when evaluated as a character.
Chris Lattnerc10adde2007-05-20 05:00:58 +0000152 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
153 Overflow = true;
154 ResultChar &= ~0U >> (32-CharWidth);
155 }
Mike Stump11289f42009-09-09 15:08:12 +0000156
Chris Lattnerc10adde2007-05-20 05:00:58 +0000157 // Check for overflow.
Chris Lattner7a02bfd2010-11-17 06:26:08 +0000158 if (Overflow && Diags) // Too many digits to fit in
Richard Smith639b8d02012-09-08 07:16:20 +0000159 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
Eli Friedman088d39a2013-07-23 00:25:18 +0000160 diag::err_hex_escape_too_large);
Chris Lattner2f5add62007-04-05 06:57:15 +0000161 break;
Chris Lattnerc10adde2007-05-20 05:00:58 +0000162 }
Chris Lattner2f5add62007-04-05 06:57:15 +0000163 case '0': case '1': case '2': case '3':
Chris Lattner812eda82007-05-20 05:17:04 +0000164 case '4': case '5': case '6': case '7': {
Chris Lattner2f5add62007-04-05 06:57:15 +0000165 // Octal escapes.
Chris Lattner3f4b6e32007-06-09 06:20:47 +0000166 --ThisTokBuf;
Chris Lattner812eda82007-05-20 05:17:04 +0000167 ResultChar = 0;
168
169 // Octal escapes are a series of octal digits with maximum length 3.
170 // "\0123" is a two digit sequence equal to "\012" "3".
171 unsigned NumDigits = 0;
172 do {
173 ResultChar <<= 3;
174 ResultChar |= *ThisTokBuf++ - '0';
175 ++NumDigits;
176 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
177 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
Mike Stump11289f42009-09-09 15:08:12 +0000178
Chris Lattner812eda82007-05-20 05:17:04 +0000179 // Check for overflow. Reject '\777', but not L'\777'.
Chris Lattner812eda82007-05-20 05:17:04 +0000180 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
Chris Lattner7a02bfd2010-11-17 06:26:08 +0000181 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +0000182 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
Eli Friedman088d39a2013-07-23 00:25:18 +0000183 diag::err_octal_escape_too_large);
Chris Lattner812eda82007-05-20 05:17:04 +0000184 ResultChar &= ~0U >> (32-CharWidth);
185 }
Chris Lattner2f5add62007-04-05 06:57:15 +0000186 break;
Chris Lattner812eda82007-05-20 05:17:04 +0000187 }
Mike Stump11289f42009-09-09 15:08:12 +0000188
Chris Lattner2f5add62007-04-05 06:57:15 +0000189 // Otherwise, these are not valid escapes.
190 case '(': case '{': case '[': case '%':
191 // GCC accepts these as extensions. We warn about them as such though.
Chris Lattner7a02bfd2010-11-17 06:26:08 +0000192 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +0000193 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194 diag::ext_nonstandard_escape)
195 << std::string(1, ResultChar);
Eli Friedman5d72d412009-04-28 00:51:18 +0000196 break;
Chris Lattner2f5add62007-04-05 06:57:15 +0000197 default:
Chris Lattner7a02bfd2010-11-17 06:26:08 +0000198 if (Diags == 0)
Douglas Gregor9af03022010-05-26 05:35:51 +0000199 break;
Richard Smith639b8d02012-09-08 07:16:20 +0000200
Jordan Rosea7d03842013-02-08 22:30:41 +0000201 if (isPrintable(ResultChar))
Richard Smith639b8d02012-09-08 07:16:20 +0000202 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
203 diag::ext_unknown_escape)
204 << std::string(1, ResultChar);
Chris Lattner59acca52008-11-22 07:23:31 +0000205 else
Richard Smith639b8d02012-09-08 07:16:20 +0000206 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
207 diag::ext_unknown_escape)
208 << "x" + llvm::utohexstr(ResultChar);
Chris Lattner2f5add62007-04-05 06:57:15 +0000209 break;
210 }
Mike Stump11289f42009-09-09 15:08:12 +0000211
Chris Lattner2f5add62007-04-05 06:57:15 +0000212 return ResultChar;
213}
214
Richard Smith8b7258b2014-02-17 21:52:30 +0000215static void appendCodePoint(unsigned Codepoint,
216 llvm::SmallVectorImpl<char> &Str) {
217 char ResultBuf[4];
218 char *ResultPtr = ResultBuf;
219 bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
220 (void)Res;
221 assert(Res && "Unexpected conversion failure");
222 Str.append(ResultBuf, ResultPtr);
223}
224
225void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
226 for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
227 if (*I != '\\') {
228 Buf.push_back(*I);
229 continue;
230 }
231
232 ++I;
233 assert(*I == 'u' || *I == 'U');
234
235 unsigned NumHexDigits;
236 if (*I == 'u')
237 NumHexDigits = 4;
238 else
239 NumHexDigits = 8;
240
241 assert(I + NumHexDigits <= E);
242
243 uint32_t CodePoint = 0;
244 for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
245 unsigned Value = llvm::hexDigitValue(*I);
246 assert(Value != -1U);
247
248 CodePoint <<= 4;
249 CodePoint += Value;
250 }
251
252 appendCodePoint(CodePoint, Buf);
253 --I;
254 }
255}
256
Steve Naroff7b753d22009-03-30 23:46:03 +0000257/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
Nico Webera6bde812010-10-09 00:27:47 +0000258/// return the UTF32.
Richard Smith2a70e652012-03-09 22:27:51 +0000259static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
260 const char *ThisTokEnd,
Nico Webera6bde812010-10-09 00:27:47 +0000261 uint32_t &UcnVal, unsigned short &UcnLen,
David Blaikie9c902b52011-09-25 23:23:43 +0000262 FullSourceLoc Loc, DiagnosticsEngine *Diags,
Seth Cantrell8b2b6772012-01-18 12:27:04 +0000263 const LangOptions &Features,
264 bool in_char_string_literal = false) {
Richard Smith2a70e652012-03-09 22:27:51 +0000265 const char *UcnBegin = ThisTokBuf;
Mike Stump11289f42009-09-09 15:08:12 +0000266
Steve Naroff7b753d22009-03-30 23:46:03 +0000267 // Skip the '\u' char's.
268 ThisTokBuf += 2;
Chris Lattner2f5add62007-04-05 06:57:15 +0000269
Jordan Rosea7d03842013-02-08 22:30:41 +0000270 if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
Chris Lattnerbde1b812010-11-17 06:46:14 +0000271 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +0000272 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
Jordan Roseaa89cf12013-01-24 20:50:13 +0000273 diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
Nico Webera6bde812010-10-09 00:27:47 +0000274 return false;
Steve Naroff7b753d22009-03-30 23:46:03 +0000275 }
Nico Webera6bde812010-10-09 00:27:47 +0000276 UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
Fariborz Jahanianabaae2b2010-08-31 23:34:27 +0000277 unsigned short UcnLenSave = UcnLen;
Nico Webera6bde812010-10-09 00:27:47 +0000278 for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
Jordan Rose78ed86a2013-01-18 22:33:58 +0000279 int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
Steve Naroff7b753d22009-03-30 23:46:03 +0000280 if (CharVal == -1) break;
281 UcnVal <<= 4;
282 UcnVal |= CharVal;
283 }
284 // If we didn't consume the proper number of digits, there is a problem.
Nico Webera6bde812010-10-09 00:27:47 +0000285 if (UcnLenSave) {
Richard Smith639b8d02012-09-08 07:16:20 +0000286 if (Diags)
287 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
288 diag::err_ucn_escape_incomplete);
Nico Webera6bde812010-10-09 00:27:47 +0000289 return false;
Steve Naroff7b753d22009-03-30 23:46:03 +0000290 }
Richard Smith2a70e652012-03-09 22:27:51 +0000291
Seth Cantrell8b2b6772012-01-18 12:27:04 +0000292 // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
Richard Smith2a70e652012-03-09 22:27:51 +0000293 if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
294 UcnVal > 0x10FFFF) { // maximum legal UTF32 value
Chris Lattnerbde1b812010-11-17 06:46:14 +0000295 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +0000296 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
297 diag::err_ucn_escape_invalid);
Nico Webera6bde812010-10-09 00:27:47 +0000298 return false;
299 }
Richard Smith2a70e652012-03-09 22:27:51 +0000300
301 // C++11 allows UCNs that refer to control characters and basic source
302 // characters inside character and string literals
303 if (UcnVal < 0xa0 &&
304 (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
Richard Smith2bf7fdb2013-01-02 11:42:31 +0000305 bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
Richard Smith2a70e652012-03-09 22:27:51 +0000306 if (Diags) {
Richard Smith2a70e652012-03-09 22:27:51 +0000307 char BasicSCSChar = UcnVal;
308 if (UcnVal >= 0x20 && UcnVal < 0x7f)
Richard Smith639b8d02012-09-08 07:16:20 +0000309 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
310 IsError ? diag::err_ucn_escape_basic_scs :
311 diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
312 << StringRef(&BasicSCSChar, 1);
Richard Smith2a70e652012-03-09 22:27:51 +0000313 else
Richard Smith639b8d02012-09-08 07:16:20 +0000314 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
315 IsError ? diag::err_ucn_control_character :
316 diag::warn_cxx98_compat_literal_ucn_control_character);
Richard Smith2a70e652012-03-09 22:27:51 +0000317 }
318 if (IsError)
319 return false;
320 }
321
Richard Smith639b8d02012-09-08 07:16:20 +0000322 if (!Features.CPlusPlus && !Features.C99 && Diags)
323 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
Jordan Rosec0cba272013-01-27 20:12:04 +0000324 diag::warn_ucn_not_valid_in_c89_literal);
Richard Smith639b8d02012-09-08 07:16:20 +0000325
Nico Webera6bde812010-10-09 00:27:47 +0000326 return true;
327}
328
Richard Smith4060f772012-06-13 05:37:23 +0000329/// MeasureUCNEscape - Determine the number of bytes within the resulting string
330/// which this UCN will occupy.
331static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
332 const char *ThisTokEnd, unsigned CharByteWidth,
333 const LangOptions &Features, bool &HadError) {
334 // UTF-32: 4 bytes per escape.
335 if (CharByteWidth == 4)
336 return 4;
337
338 uint32_t UcnVal = 0;
339 unsigned short UcnLen = 0;
340 FullSourceLoc Loc;
341
342 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
343 UcnLen, Loc, 0, Features, true)) {
344 HadError = true;
345 return 0;
346 }
347
348 // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
349 if (CharByteWidth == 2)
350 return UcnVal <= 0xFFFF ? 2 : 4;
351
352 // UTF-8.
353 if (UcnVal < 0x80)
354 return 1;
355 if (UcnVal < 0x800)
356 return 2;
357 if (UcnVal < 0x10000)
358 return 3;
359 return 4;
360}
361
Nico Webera6bde812010-10-09 00:27:47 +0000362/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
363/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
364/// StringLiteralParser. When we decide to implement UCN's for identifiers,
365/// we will likely rework our support for UCN's.
Richard Smith2a70e652012-03-09 22:27:51 +0000366static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
367 const char *ThisTokEnd,
Chris Lattner2be8aa92010-11-17 07:12:42 +0000368 char *&ResultBuf, bool &HadError,
Douglas Gregorfb65e592011-07-27 05:40:30 +0000369 FullSourceLoc Loc, unsigned CharByteWidth,
David Blaikie9c902b52011-09-25 23:23:43 +0000370 DiagnosticsEngine *Diags,
371 const LangOptions &Features) {
Nico Webera6bde812010-10-09 00:27:47 +0000372 typedef uint32_t UTF32;
373 UTF32 UcnVal = 0;
374 unsigned short UcnLen = 0;
Richard Smith2a70e652012-03-09 22:27:51 +0000375 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
376 Loc, Diags, Features, true)) {
Richard Smith4060f772012-06-13 05:37:23 +0000377 HadError = true;
Steve Naroff7b753d22009-03-30 23:46:03 +0000378 return;
379 }
Nico Webera6bde812010-10-09 00:27:47 +0000380
Eli Friedmanf9edb002013-09-18 23:23:13 +0000381 assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
Douglas Gregorfb65e592011-07-27 05:40:30 +0000382 "only character widths of 1, 2, or 4 bytes supported");
Nico Weber9762e0a2010-10-06 04:57:26 +0000383
Douglas Gregorfb65e592011-07-27 05:40:30 +0000384 (void)UcnLen;
385 assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
Nico Weber9762e0a2010-10-06 04:57:26 +0000386
Douglas Gregorfb65e592011-07-27 05:40:30 +0000387 if (CharByteWidth == 4) {
Eli Friedmand1370792011-11-02 23:06:23 +0000388 // FIXME: Make the type of the result buffer correct instead of
389 // using reinterpret_cast.
390 UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
391 *ResultPtr = UcnVal;
392 ResultBuf += 4;
Douglas Gregorfb65e592011-07-27 05:40:30 +0000393 return;
394 }
395
396 if (CharByteWidth == 2) {
Eli Friedmand1370792011-11-02 23:06:23 +0000397 // FIXME: Make the type of the result buffer correct instead of
398 // using reinterpret_cast.
399 UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
400
Richard Smith0948d932012-06-13 05:41:29 +0000401 if (UcnVal <= (UTF32)0xFFFF) {
Eli Friedmand1370792011-11-02 23:06:23 +0000402 *ResultPtr = UcnVal;
403 ResultBuf += 2;
Nico Weber9762e0a2010-10-06 04:57:26 +0000404 return;
405 }
Nico Weber9762e0a2010-10-06 04:57:26 +0000406
Eli Friedmand1370792011-11-02 23:06:23 +0000407 // Convert to UTF16.
Nico Weber9762e0a2010-10-06 04:57:26 +0000408 UcnVal -= 0x10000;
Eli Friedmand1370792011-11-02 23:06:23 +0000409 *ResultPtr = 0xD800 + (UcnVal >> 10);
410 *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
411 ResultBuf += 4;
Fariborz Jahanianabaae2b2010-08-31 23:34:27 +0000412 return;
413 }
Douglas Gregorfb65e592011-07-27 05:40:30 +0000414
415 assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
416
Steve Naroff7b753d22009-03-30 23:46:03 +0000417 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
418 // The conversion below was inspired by:
419 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
Mike Stump11289f42009-09-09 15:08:12 +0000420 // First, we determine how many bytes the result will require.
Steve Naroffc94adda2009-04-01 11:09:15 +0000421 typedef uint8_t UTF8;
Steve Naroff7b753d22009-03-30 23:46:03 +0000422
423 unsigned short bytesToWrite = 0;
424 if (UcnVal < (UTF32)0x80)
425 bytesToWrite = 1;
426 else if (UcnVal < (UTF32)0x800)
427 bytesToWrite = 2;
428 else if (UcnVal < (UTF32)0x10000)
429 bytesToWrite = 3;
430 else
431 bytesToWrite = 4;
Mike Stump11289f42009-09-09 15:08:12 +0000432
Steve Naroff7b753d22009-03-30 23:46:03 +0000433 const unsigned byteMask = 0xBF;
434 const unsigned byteMark = 0x80;
Mike Stump11289f42009-09-09 15:08:12 +0000435
Steve Naroff7b753d22009-03-30 23:46:03 +0000436 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
Steve Narofff2a880c2009-03-31 10:29:45 +0000437 // into the first byte, depending on how many bytes follow.
Mike Stump11289f42009-09-09 15:08:12 +0000438 static const UTF8 firstByteMark[5] = {
Steve Narofff2a880c2009-03-31 10:29:45 +0000439 0x00, 0x00, 0xC0, 0xE0, 0xF0
Steve Naroff7b753d22009-03-30 23:46:03 +0000440 };
441 // Finally, we write the bytes into ResultBuf.
442 ResultBuf += bytesToWrite;
443 switch (bytesToWrite) { // note: everything falls through.
Benjamin Kramerf23a6e62012-11-08 19:22:26 +0000444 case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
445 case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
446 case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
447 case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
Steve Naroff7b753d22009-03-30 23:46:03 +0000448 }
449 // Update the buffer.
450 ResultBuf += bytesToWrite;
451}
Chris Lattner2f5add62007-04-05 06:57:15 +0000452
453
Steve Naroff09ef4742007-03-09 23:16:33 +0000454/// integer-constant: [C99 6.4.4.1]
455/// decimal-constant integer-suffix
456/// octal-constant integer-suffix
457/// hexadecimal-constant integer-suffix
Richard Smithf4198b72013-07-23 08:14:48 +0000458/// binary-literal integer-suffix [GNU, C++1y]
Richard Smith81292452012-03-08 21:59:28 +0000459/// user-defined-integer-literal: [C++11 lex.ext]
Richard Smith39570d002012-03-08 08:45:32 +0000460/// decimal-literal ud-suffix
461/// octal-literal ud-suffix
462/// hexadecimal-literal ud-suffix
Richard Smithf4198b72013-07-23 08:14:48 +0000463/// binary-literal ud-suffix [GNU, C++1y]
Mike Stump11289f42009-09-09 15:08:12 +0000464/// decimal-constant:
Steve Naroff09ef4742007-03-09 23:16:33 +0000465/// nonzero-digit
466/// decimal-constant digit
Mike Stump11289f42009-09-09 15:08:12 +0000467/// octal-constant:
Steve Naroff09ef4742007-03-09 23:16:33 +0000468/// 0
469/// octal-constant octal-digit
Mike Stump11289f42009-09-09 15:08:12 +0000470/// hexadecimal-constant:
Steve Naroff09ef4742007-03-09 23:16:33 +0000471/// hexadecimal-prefix hexadecimal-digit
472/// hexadecimal-constant hexadecimal-digit
473/// hexadecimal-prefix: one of
474/// 0x 0X
Richard Smithf4198b72013-07-23 08:14:48 +0000475/// binary-literal:
476/// 0b binary-digit
477/// 0B binary-digit
478/// binary-literal binary-digit
Steve Naroff09ef4742007-03-09 23:16:33 +0000479/// integer-suffix:
480/// unsigned-suffix [long-suffix]
481/// unsigned-suffix [long-long-suffix]
482/// long-suffix [unsigned-suffix]
483/// long-long-suffix [unsigned-suffix]
484/// nonzero-digit:
485/// 1 2 3 4 5 6 7 8 9
486/// octal-digit:
487/// 0 1 2 3 4 5 6 7
488/// hexadecimal-digit:
489/// 0 1 2 3 4 5 6 7 8 9
490/// a b c d e f
491/// A B C D E F
Richard Smithf4198b72013-07-23 08:14:48 +0000492/// binary-digit:
493/// 0
494/// 1
Steve Naroff09ef4742007-03-09 23:16:33 +0000495/// unsigned-suffix: one of
496/// u U
497/// long-suffix: one of
498/// l L
Mike Stump11289f42009-09-09 15:08:12 +0000499/// long-long-suffix: one of
Steve Naroff09ef4742007-03-09 23:16:33 +0000500/// ll LL
501///
502/// floating-constant: [C99 6.4.4.2]
503/// TODO: add rules...
504///
Dmitri Gribenko7ba91722012-09-24 09:53:54 +0000505NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
506 SourceLocation TokLoc,
507 Preprocessor &PP)
508 : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
Mike Stump11289f42009-09-09 15:08:12 +0000509
Chris Lattner59f09b62008-09-30 20:45:40 +0000510 // This routine assumes that the range begin/end matches the regex for integer
511 // and FP constants (specifically, the 'pp-number' regex), and assumes that
512 // the byte at "*end" is both valid and not part of the regex. Because of
513 // this, it doesn't have to check for 'overscan' in various places.
Jordan Rosea7d03842013-02-08 22:30:41 +0000514 assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");
Mike Stump11289f42009-09-09 15:08:12 +0000515
Dmitri Gribenko7ba91722012-09-24 09:53:54 +0000516 s = DigitsBegin = ThisTokBegin;
Steve Naroff09ef4742007-03-09 23:16:33 +0000517 saw_exponent = false;
518 saw_period = false;
Richard Smith39570d002012-03-08 08:45:32 +0000519 saw_ud_suffix = false;
Steve Naroff09ef4742007-03-09 23:16:33 +0000520 isLong = false;
521 isUnsigned = false;
522 isLongLong = false;
Chris Lattnered045422007-08-26 03:29:23 +0000523 isFloat = false;
Chris Lattnerf55ab182007-08-26 01:58:14 +0000524 isImaginary = false;
Mike Stumpc99c0222009-10-08 22:55:36 +0000525 isMicrosoftInteger = false;
Steve Naroff09ef4742007-03-09 23:16:33 +0000526 hadError = false;
Mike Stump11289f42009-09-09 15:08:12 +0000527
Steve Naroff09ef4742007-03-09 23:16:33 +0000528 if (*s == '0') { // parse radix
Chris Lattner6016a512008-06-30 06:39:54 +0000529 ParseNumberStartingWithZero(TokLoc);
530 if (hadError)
531 return;
Steve Naroff09ef4742007-03-09 23:16:33 +0000532 } else { // the first digit is non-zero
533 radix = 10;
534 s = SkipDigits(s);
535 if (s == ThisTokEnd) {
Chris Lattner328fa5c2007-06-08 17:12:06 +0000536 // Done.
Jordan Rosea7d03842013-02-08 22:30:41 +0000537 } else if (isHexDigit(*s) && !(*s == 'e' || *s == 'E')) {
Dmitri Gribenko7ba91722012-09-24 09:53:54 +0000538 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
Chris Lattner0e62c1c2011-07-23 10:55:15 +0000539 diag::err_invalid_decimal_digit) << StringRef(s, 1);
Chris Lattner59acca52008-11-22 07:23:31 +0000540 hadError = true;
Chris Lattner328fa5c2007-06-08 17:12:06 +0000541 return;
Steve Naroff09ef4742007-03-09 23:16:33 +0000542 } else if (*s == '.') {
Richard Smith1e130482013-09-26 04:19:11 +0000543 checkSeparator(TokLoc, s, CSK_AfterDigits);
Steve Naroff09ef4742007-03-09 23:16:33 +0000544 s++;
545 saw_period = true;
Richard Smith1e130482013-09-26 04:19:11 +0000546 checkSeparator(TokLoc, s, CSK_BeforeDigits);
Steve Naroff09ef4742007-03-09 23:16:33 +0000547 s = SkipDigits(s);
Mike Stump11289f42009-09-09 15:08:12 +0000548 }
Chris Lattnerfb8b8f22008-09-29 23:12:31 +0000549 if ((*s == 'e' || *s == 'E')) { // exponent
Richard Smith1e130482013-09-26 04:19:11 +0000550 checkSeparator(TokLoc, s, CSK_AfterDigits);
Chris Lattner4885b972008-04-20 18:47:55 +0000551 const char *Exponent = s;
Steve Naroff09ef4742007-03-09 23:16:33 +0000552 s++;
553 saw_exponent = true;
554 if (*s == '+' || *s == '-') s++; // sign
Richard Smith1e130482013-09-26 04:19:11 +0000555 checkSeparator(TokLoc, s, CSK_BeforeDigits);
Steve Naroff09ef4742007-03-09 23:16:33 +0000556 const char *first_non_digit = SkipDigits(s);
Chris Lattner48a9b9b2008-04-20 18:41:46 +0000557 if (first_non_digit != s) {
Steve Naroff09ef4742007-03-09 23:16:33 +0000558 s = first_non_digit;
Chris Lattner48a9b9b2008-04-20 18:41:46 +0000559 } else {
Dmitri Gribenko7ba91722012-09-24 09:53:54 +0000560 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent - ThisTokBegin),
Chris Lattner59acca52008-11-22 07:23:31 +0000561 diag::err_exponent_has_no_digits);
562 hadError = true;
Chris Lattner48a9b9b2008-04-20 18:41:46 +0000563 return;
Steve Naroff09ef4742007-03-09 23:16:33 +0000564 }
565 }
566 }
567
568 SuffixBegin = s;
Richard Smith1e130482013-09-26 04:19:11 +0000569 checkSeparator(TokLoc, s, CSK_AfterDigits);
Mike Stump11289f42009-09-09 15:08:12 +0000570
Chris Lattnerf55ab182007-08-26 01:58:14 +0000571 // Parse the suffix. At this point we can classify whether we have an FP or
572 // integer constant.
573 bool isFPConstant = isFloatingLiteral();
Richard Smithf4198b72013-07-23 08:14:48 +0000574 const char *ImaginarySuffixLoc = 0;
Mike Stump11289f42009-09-09 15:08:12 +0000575
Chris Lattnerf55ab182007-08-26 01:58:14 +0000576 // Loop over all of the characters of the suffix. If we see something bad,
577 // we break out of the loop.
578 for (; s != ThisTokEnd; ++s) {
579 switch (*s) {
580 case 'f': // FP Suffix for "float"
581 case 'F':
582 if (!isFPConstant) break; // Error for integer constant.
Chris Lattnered045422007-08-26 03:29:23 +0000583 if (isFloat || isLong) break; // FF, LF invalid.
584 isFloat = true;
Chris Lattnerf55ab182007-08-26 01:58:14 +0000585 continue; // Success.
586 case 'u':
587 case 'U':
588 if (isFPConstant) break; // Error for floating constant.
589 if (isUnsigned) break; // Cannot be repeated.
590 isUnsigned = true;
591 continue; // Success.
592 case 'l':
593 case 'L':
594 if (isLong || isLongLong) break; // Cannot be repeated.
Chris Lattnered045422007-08-26 03:29:23 +0000595 if (isFloat) break; // LF invalid.
Mike Stump11289f42009-09-09 15:08:12 +0000596
Chris Lattnerf55ab182007-08-26 01:58:14 +0000597 // Check for long long. The L's need to be adjacent and the same case.
598 if (s+1 != ThisTokEnd && s[1] == s[0]) {
599 if (isFPConstant) break; // long long invalid for floats.
600 isLongLong = true;
601 ++s; // Eat both of them.
602 } else {
Steve Naroff09ef4742007-03-09 23:16:33 +0000603 isLong = true;
Steve Naroff09ef4742007-03-09 23:16:33 +0000604 }
Chris Lattnerf55ab182007-08-26 01:58:14 +0000605 continue; // Success.
606 case 'i':
Chris Lattner26f6c222010-10-14 00:24:10 +0000607 case 'I':
David Blaikiebbafb8a2012-03-11 07:00:24 +0000608 if (PP.getLangOpts().MicrosoftExt) {
Fariborz Jahanian8c6c0b62010-01-22 21:36:53 +0000609 if (isFPConstant || isLong || isLongLong) break;
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000610
Steve Naroffa1f41452008-04-04 21:02:54 +0000611 // Allow i8, i16, i32, i64, and i128.
Mike Stumpc99c0222009-10-08 22:55:36 +0000612 if (s + 1 != ThisTokEnd) {
613 switch (s[1]) {
614 case '8':
615 s += 2; // i8 suffix
616 isMicrosoftInteger = true;
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000617 break;
Mike Stumpc99c0222009-10-08 22:55:36 +0000618 case '1':
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000619 if (s + 2 == ThisTokEnd) break;
Francois Pichet12df1dc2011-01-11 11:57:53 +0000620 if (s[2] == '6') {
621 s += 3; // i16 suffix
622 isMicrosoftInteger = true;
623 }
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000624 else if (s[2] == '2') {
625 if (s + 3 == ThisTokEnd) break;
Francois Pichet12df1dc2011-01-11 11:57:53 +0000626 if (s[3] == '8') {
627 s += 4; // i128 suffix
628 isMicrosoftInteger = true;
629 }
Mike Stumpc99c0222009-10-08 22:55:36 +0000630 }
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000631 break;
Mike Stumpc99c0222009-10-08 22:55:36 +0000632 case '3':
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000633 if (s + 2 == ThisTokEnd) break;
Francois Pichet12df1dc2011-01-11 11:57:53 +0000634 if (s[2] == '2') {
635 s += 3; // i32 suffix
636 isLong = true;
637 isMicrosoftInteger = true;
638 }
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000639 break;
Mike Stumpc99c0222009-10-08 22:55:36 +0000640 case '6':
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000641 if (s + 2 == ThisTokEnd) break;
Francois Pichet12df1dc2011-01-11 11:57:53 +0000642 if (s[2] == '4') {
643 s += 3; // i64 suffix
644 isLongLong = true;
645 isMicrosoftInteger = true;
646 }
Nuno Lopesbaa1bc42009-11-28 13:37:52 +0000647 break;
Mike Stumpc99c0222009-10-08 22:55:36 +0000648 default:
649 break;
650 }
651 break;
Steve Naroffa1f41452008-04-04 21:02:54 +0000652 }
Steve Naroffa1f41452008-04-04 21:02:54 +0000653 }
Richard Smith2a988622013-09-24 04:06:10 +0000654 // "i", "if", and "il" are user-defined suffixes in C++1y.
655 if (PP.getLangOpts().CPlusPlus1y && *s == 'i')
656 break;
Steve Naroffa1f41452008-04-04 21:02:54 +0000657 // fall through.
Chris Lattnerf55ab182007-08-26 01:58:14 +0000658 case 'j':
659 case 'J':
660 if (isImaginary) break; // Cannot be repeated.
Chris Lattnerf55ab182007-08-26 01:58:14 +0000661 isImaginary = true;
Richard Smithf4198b72013-07-23 08:14:48 +0000662 ImaginarySuffixLoc = s;
Chris Lattnerf55ab182007-08-26 01:58:14 +0000663 continue; // Success.
Steve Naroff09ef4742007-03-09 23:16:33 +0000664 }
Richard Smith39570d002012-03-08 08:45:32 +0000665 // If we reached here, there was an error or a ud-suffix.
Chris Lattnerf55ab182007-08-26 01:58:14 +0000666 break;
667 }
Mike Stump11289f42009-09-09 15:08:12 +0000668
Chris Lattnerf55ab182007-08-26 01:58:14 +0000669 if (s != ThisTokEnd) {
Richard Smith8b7258b2014-02-17 21:52:30 +0000670 // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
671 expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
672 if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
Richard Smithf4198b72013-07-23 08:14:48 +0000673 // Any suffix pieces we might have parsed are actually part of the
674 // ud-suffix.
675 isLong = false;
676 isUnsigned = false;
677 isLongLong = false;
678 isFloat = false;
679 isImaginary = false;
680 isMicrosoftInteger = false;
681
Richard Smith39570d002012-03-08 08:45:32 +0000682 saw_ud_suffix = true;
683 return;
684 }
685
686 // Report an error if there are any.
Dmitri Gribenko7ba91722012-09-24 09:53:54 +0000687 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
Chris Lattner59acca52008-11-22 07:23:31 +0000688 isFPConstant ? diag::err_invalid_suffix_float_constant :
689 diag::err_invalid_suffix_integer_constant)
Chris Lattner0e62c1c2011-07-23 10:55:15 +0000690 << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
Chris Lattner59acca52008-11-22 07:23:31 +0000691 hadError = true;
Chris Lattnerf55ab182007-08-26 01:58:14 +0000692 return;
Steve Naroff09ef4742007-03-09 23:16:33 +0000693 }
Richard Smithf4198b72013-07-23 08:14:48 +0000694
695 if (isImaginary) {
696 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc,
697 ImaginarySuffixLoc - ThisTokBegin),
698 diag::ext_imaginary_constant);
699 }
700}
701
702/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
703/// suffixes as ud-suffixes, because the diagnostic experience is better if we
704/// treat it as an invalid suffix.
705bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
706 StringRef Suffix) {
707 if (!LangOpts.CPlusPlus11 || Suffix.empty())
708 return false;
709
710 // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
711 if (Suffix[0] == '_')
712 return true;
713
714 // In C++11, there are no library suffixes.
715 if (!LangOpts.CPlusPlus1y)
716 return false;
717
718 // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
Richard Smith2a988622013-09-24 04:06:10 +0000719 // Per tweaked N3660, "il", "i", and "if" are also used in the library.
Richard Smithf4198b72013-07-23 08:14:48 +0000720 return llvm::StringSwitch<bool>(Suffix)
721 .Cases("h", "min", "s", true)
722 .Cases("ms", "us", "ns", true)
Richard Smith2a988622013-09-24 04:06:10 +0000723 .Cases("il", "i", "if", true)
Richard Smithf4198b72013-07-23 08:14:48 +0000724 .Default(false);
Steve Naroff09ef4742007-03-09 23:16:33 +0000725}
726
Richard Smithfde94852013-09-26 03:33:06 +0000727void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
Richard Smith1e130482013-09-26 04:19:11 +0000728 const char *Pos,
729 CheckSeparatorKind IsAfterDigits) {
730 if (IsAfterDigits == CSK_AfterDigits) {
Richard Smith99dc0712013-09-26 05:57:03 +0000731 if (Pos == ThisTokBegin)
732 return;
Richard Smithfde94852013-09-26 03:33:06 +0000733 --Pos;
Richard Smith99dc0712013-09-26 05:57:03 +0000734 } else if (Pos == ThisTokEnd)
735 return;
Richard Smithfde94852013-09-26 03:33:06 +0000736
737 if (isDigitSeparator(*Pos))
738 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
739 diag::err_digit_separator_not_between_digits)
740 << IsAfterDigits;
741}
742
Chris Lattner6016a512008-06-30 06:39:54 +0000743/// ParseNumberStartingWithZero - This method is called when the first character
744/// of the number is found to be a zero. This means it is either an octal
745/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
Mike Stump11289f42009-09-09 15:08:12 +0000746/// a floating point number (01239.123e4). Eat the prefix, determining the
Chris Lattner6016a512008-06-30 06:39:54 +0000747/// radix etc.
748void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
749 assert(s[0] == '0' && "Invalid method call");
750 s++;
Mike Stump11289f42009-09-09 15:08:12 +0000751
NAKAMURA Takumif2bc8f32013-09-27 04:42:28 +0000752 int c1 = s[0];
753 int c2 = s[1];
754
Chris Lattner6016a512008-06-30 06:39:54 +0000755 // Handle a hex number like 0x1234.
NAKAMURA Takumif2bc8f32013-09-27 04:42:28 +0000756 if ((c1 == 'x' || c1 == 'X') && (isHexDigit(c2) || c2 == '.')) {
Chris Lattner6016a512008-06-30 06:39:54 +0000757 s++;
758 radix = 16;
759 DigitsBegin = s;
760 s = SkipHexDigits(s);
Aaron Ballmane1224a52012-02-08 13:36:33 +0000761 bool noSignificand = (s == DigitsBegin);
Chris Lattner6016a512008-06-30 06:39:54 +0000762 if (s == ThisTokEnd) {
763 // Done.
764 } else if (*s == '.') {
765 s++;
766 saw_period = true;
Aaron Ballmane1224a52012-02-08 13:36:33 +0000767 const char *floatDigitsBegin = s;
Richard Smith70ee92f2014-04-22 23:50:25 +0000768 checkSeparator(TokLoc, s, CSK_BeforeDigits);
Chris Lattner6016a512008-06-30 06:39:54 +0000769 s = SkipHexDigits(s);
Aaron Ballmane1224a52012-02-08 13:36:33 +0000770 noSignificand &= (floatDigitsBegin == s);
Chris Lattner6016a512008-06-30 06:39:54 +0000771 }
Aaron Ballmane1224a52012-02-08 13:36:33 +0000772
773 if (noSignificand) {
Dmitri Gribenko7ba91722012-09-24 09:53:54 +0000774 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
Aaron Ballmane1224a52012-02-08 13:36:33 +0000775 diag::err_hexconstant_requires_digits);
776 hadError = true;
777 return;
778 }
779
Chris Lattner6016a512008-06-30 06:39:54 +0000780 // A binary exponent can appear with or with a '.'. If dotted, the
Mike Stump11289f42009-09-09 15:08:12 +0000781 // binary exponent is required.
Douglas Gregor86325ad2011-08-30 22:40:35 +0000782 if (*s == 'p' || *s == 'P') {
Richard Smith70ee92f2014-04-22 23:50:25 +0000783 checkSeparator(TokLoc, s, CSK_AfterDigits);
Chris Lattner6016a512008-06-30 06:39:54 +0000784 const char *Exponent = s;
785 s++;
786 saw_exponent = true;
787 if (*s == '+' || *s == '-') s++; // sign
788 const char *first_non_digit = SkipDigits(s);
Chris Lattnerc94ad4a2008-07-25 18:18:34 +0000789 if (first_non_digit == s) {
Chris Lattner59acca52008-11-22 07:23:31 +0000790 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
791 diag::err_exponent_has_no_digits);
792 hadError = true;
Chris Lattnerc94ad4a2008-07-25 18:18:34 +0000793 return;
Chris Lattner6016a512008-06-30 06:39:54 +0000794 }
Richard Smith70ee92f2014-04-22 23:50:25 +0000795 checkSeparator(TokLoc, s, CSK_BeforeDigits);
Chris Lattnerc94ad4a2008-07-25 18:18:34 +0000796 s = first_non_digit;
Mike Stump11289f42009-09-09 15:08:12 +0000797
David Blaikiebbafb8a2012-03-11 07:00:24 +0000798 if (!PP.getLangOpts().HexFloats)
Chris Lattner59acca52008-11-22 07:23:31 +0000799 PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
Chris Lattner6016a512008-06-30 06:39:54 +0000800 } else if (saw_period) {
Chris Lattner59acca52008-11-22 07:23:31 +0000801 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
802 diag::err_hexconstant_requires_exponent);
803 hadError = true;
Chris Lattner6016a512008-06-30 06:39:54 +0000804 }
805 return;
806 }
Mike Stump11289f42009-09-09 15:08:12 +0000807
Chris Lattner6016a512008-06-30 06:39:54 +0000808 // Handle simple binary numbers 0b01010
NAKAMURA Takumif2bc8f32013-09-27 04:42:28 +0000809 if ((c1 == 'b' || c1 == 'B') && (c2 == '0' || c2 == '1')) {
Richard Smithc5c27f22013-04-19 20:47:20 +0000810 // 0b101010 is a C++1y / GCC extension.
811 PP.Diag(TokLoc,
812 PP.getLangOpts().CPlusPlus1y
813 ? diag::warn_cxx11_compat_binary_literal
814 : PP.getLangOpts().CPlusPlus
815 ? diag::ext_binary_literal_cxx1y
816 : diag::ext_binary_literal);
Chris Lattner6016a512008-06-30 06:39:54 +0000817 ++s;
818 radix = 2;
819 DigitsBegin = s;
820 s = SkipBinaryDigits(s);
821 if (s == ThisTokEnd) {
822 // Done.
Jordan Rosea7d03842013-02-08 22:30:41 +0000823 } else if (isHexDigit(*s)) {
Chris Lattner59acca52008-11-22 07:23:31 +0000824 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
Chris Lattner0e62c1c2011-07-23 10:55:15 +0000825 diag::err_invalid_binary_digit) << StringRef(s, 1);
Chris Lattner59acca52008-11-22 07:23:31 +0000826 hadError = true;
Chris Lattner6016a512008-06-30 06:39:54 +0000827 }
Chris Lattnerd68c04f2008-06-30 06:44:49 +0000828 // Other suffixes will be diagnosed by the caller.
Chris Lattner6016a512008-06-30 06:39:54 +0000829 return;
830 }
Mike Stump11289f42009-09-09 15:08:12 +0000831
Chris Lattner6016a512008-06-30 06:39:54 +0000832 // For now, the radix is set to 8. If we discover that we have a
833 // floating point constant, the radix will change to 10. Octal floating
Mike Stump11289f42009-09-09 15:08:12 +0000834 // point constants are not permitted (only decimal and hexadecimal).
Chris Lattner6016a512008-06-30 06:39:54 +0000835 radix = 8;
836 DigitsBegin = s;
837 s = SkipOctalDigits(s);
838 if (s == ThisTokEnd)
839 return; // Done, simple octal number like 01234
Mike Stump11289f42009-09-09 15:08:12 +0000840
Chris Lattnerd68c04f2008-06-30 06:44:49 +0000841 // If we have some other non-octal digit that *is* a decimal digit, see if
842 // this is part of a floating point number like 094.123 or 09e1.
Jordan Rosea7d03842013-02-08 22:30:41 +0000843 if (isDigit(*s)) {
Chris Lattnerd68c04f2008-06-30 06:44:49 +0000844 const char *EndDecimal = SkipDigits(s);
845 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
846 s = EndDecimal;
847 radix = 10;
848 }
849 }
Mike Stump11289f42009-09-09 15:08:12 +0000850
Chris Lattnerd68c04f2008-06-30 06:44:49 +0000851 // If we have a hex digit other than 'e' (which denotes a FP exponent) then
852 // the code is using an incorrect base.
Jordan Rosea7d03842013-02-08 22:30:41 +0000853 if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
Chris Lattner59acca52008-11-22 07:23:31 +0000854 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
Chris Lattner0e62c1c2011-07-23 10:55:15 +0000855 diag::err_invalid_octal_digit) << StringRef(s, 1);
Chris Lattner59acca52008-11-22 07:23:31 +0000856 hadError = true;
Chris Lattner6016a512008-06-30 06:39:54 +0000857 return;
858 }
Mike Stump11289f42009-09-09 15:08:12 +0000859
Chris Lattner6016a512008-06-30 06:39:54 +0000860 if (*s == '.') {
861 s++;
862 radix = 10;
863 saw_period = true;
Richard Smith70ee92f2014-04-22 23:50:25 +0000864 checkSeparator(TokLoc, s, CSK_BeforeDigits);
Chris Lattnerd68c04f2008-06-30 06:44:49 +0000865 s = SkipDigits(s); // Skip suffix.
Chris Lattner6016a512008-06-30 06:39:54 +0000866 }
867 if (*s == 'e' || *s == 'E') { // exponent
Richard Smith70ee92f2014-04-22 23:50:25 +0000868 checkSeparator(TokLoc, s, CSK_AfterDigits);
Chris Lattner6016a512008-06-30 06:39:54 +0000869 const char *Exponent = s;
870 s++;
871 radix = 10;
872 saw_exponent = true;
873 if (*s == '+' || *s == '-') s++; // sign
874 const char *first_non_digit = SkipDigits(s);
875 if (first_non_digit != s) {
Richard Smith70ee92f2014-04-22 23:50:25 +0000876 checkSeparator(TokLoc, s, CSK_BeforeDigits);
Chris Lattner6016a512008-06-30 06:39:54 +0000877 s = first_non_digit;
878 } else {
Mike Stump11289f42009-09-09 15:08:12 +0000879 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
Chris Lattner59acca52008-11-22 07:23:31 +0000880 diag::err_exponent_has_no_digits);
881 hadError = true;
Chris Lattner6016a512008-06-30 06:39:54 +0000882 return;
883 }
884 }
885}
886
Jordan Rosede584de2012-09-25 22:32:51 +0000887static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
Dmitri Gribenko511288b2012-09-25 19:09:15 +0000888 switch (Radix) {
889 case 2:
890 return NumDigits <= 64;
891 case 8:
892 return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
893 case 10:
894 return NumDigits <= 19; // floor(log10(2^64))
895 case 16:
896 return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
897 default:
898 llvm_unreachable("impossible Radix");
899 }
900}
Chris Lattner6016a512008-06-30 06:39:54 +0000901
Chris Lattner5b743d32007-04-04 05:52:58 +0000902/// GetIntegerValue - Convert this numeric literal value to an APInt that
Chris Lattner871b4e12007-04-04 06:36:34 +0000903/// matches Val's input width. If there is an overflow, set Val to the low bits
904/// of the result and return true. Otherwise, return false.
Chris Lattner23b7eb62007-06-15 23:05:46 +0000905bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
Daniel Dunbarbe947082008-10-16 07:32:01 +0000906 // Fast path: Compute a conservative bound on the maximum number of
907 // bits per digit in this radix. If we can't possibly overflow a
908 // uint64 based on that bound then do the simple conversion to
909 // integer. This avoids the expensive overflow checking below, and
910 // handles the common cases that matter (small decimal integers and
911 // hex/octal values which don't overflow).
Dmitri Gribenko511288b2012-09-25 19:09:15 +0000912 const unsigned NumDigits = SuffixBegin - DigitsBegin;
Jordan Rosede584de2012-09-25 22:32:51 +0000913 if (alwaysFitsInto64Bits(radix, NumDigits)) {
Daniel Dunbarbe947082008-10-16 07:32:01 +0000914 uint64_t N = 0;
Dmitri Gribenko511288b2012-09-25 19:09:15 +0000915 for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
Richard Smithfde94852013-09-26 03:33:06 +0000916 if (!isDigitSeparator(*Ptr))
917 N = N * radix + llvm::hexDigitValue(*Ptr);
Daniel Dunbarbe947082008-10-16 07:32:01 +0000918
919 // This will truncate the value to Val's input width. Simply check
920 // for overflow by comparing.
921 Val = N;
922 return Val.getZExtValue() != N;
923 }
924
Chris Lattner5b743d32007-04-04 05:52:58 +0000925 Val = 0;
Dmitri Gribenko511288b2012-09-25 19:09:15 +0000926 const char *Ptr = DigitsBegin;
Chris Lattner5b743d32007-04-04 05:52:58 +0000927
Chris Lattner23b7eb62007-06-15 23:05:46 +0000928 llvm::APInt RadixVal(Val.getBitWidth(), radix);
929 llvm::APInt CharVal(Val.getBitWidth(), 0);
930 llvm::APInt OldVal = Val;
Mike Stump11289f42009-09-09 15:08:12 +0000931
Chris Lattner871b4e12007-04-04 06:36:34 +0000932 bool OverflowOccurred = false;
Dmitri Gribenko511288b2012-09-25 19:09:15 +0000933 while (Ptr < SuffixBegin) {
Richard Smithfde94852013-09-26 03:33:06 +0000934 if (isDigitSeparator(*Ptr)) {
935 ++Ptr;
936 continue;
937 }
938
Jordan Rose78ed86a2013-01-18 22:33:58 +0000939 unsigned C = llvm::hexDigitValue(*Ptr++);
Mike Stump11289f42009-09-09 15:08:12 +0000940
Chris Lattner5b743d32007-04-04 05:52:58 +0000941 // If this letter is out of bound for this radix, reject it.
Chris Lattner531efa42007-04-04 06:49:26 +0000942 assert(C < radix && "NumericLiteralParser ctor should have rejected this");
Mike Stump11289f42009-09-09 15:08:12 +0000943
Chris Lattner5b743d32007-04-04 05:52:58 +0000944 CharVal = C;
Mike Stump11289f42009-09-09 15:08:12 +0000945
Chris Lattner871b4e12007-04-04 06:36:34 +0000946 // Add the digit to the value in the appropriate radix. If adding in digits
947 // made the value smaller, then this overflowed.
Chris Lattner5b743d32007-04-04 05:52:58 +0000948 OldVal = Val;
Chris Lattner871b4e12007-04-04 06:36:34 +0000949
950 // Multiply by radix, did overflow occur on the multiply?
Chris Lattner5b743d32007-04-04 05:52:58 +0000951 Val *= RadixVal;
Chris Lattner871b4e12007-04-04 06:36:34 +0000952 OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
953
Chris Lattner871b4e12007-04-04 06:36:34 +0000954 // Add value, did overflow occur on the value?
Daniel Dunbarb1f64422008-10-16 06:39:30 +0000955 // (a + b) ult b <=> overflow
Chris Lattner5b743d32007-04-04 05:52:58 +0000956 Val += CharVal;
Chris Lattner871b4e12007-04-04 06:36:34 +0000957 OverflowOccurred |= Val.ult(CharVal);
Chris Lattner5b743d32007-04-04 05:52:58 +0000958 }
Chris Lattner871b4e12007-04-04 06:36:34 +0000959 return OverflowOccurred;
Chris Lattner5b743d32007-04-04 05:52:58 +0000960}
961
John McCall53b93a02009-12-24 09:08:04 +0000962llvm::APFloat::opStatus
963NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
Ted Kremenekfbb08bc2007-11-26 23:12:30 +0000964 using llvm::APFloat;
Mike Stump11289f42009-09-09 15:08:12 +0000965
Erick Tryzelaarb9073112009-08-16 23:36:28 +0000966 unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
Richard Smithfde94852013-09-26 03:33:06 +0000967
968 llvm::SmallString<16> Buffer;
969 StringRef Str(ThisTokBegin, n);
970 if (Str.find('\'') != StringRef::npos) {
971 Buffer.reserve(n);
972 std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
973 &isDigitSeparator);
974 Str = Buffer;
975 }
976
977 return Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
Steve Naroff97b9e912007-07-09 23:53:58 +0000978}
Chris Lattner5b743d32007-04-04 05:52:58 +0000979
Chris Lattner2f5add62007-04-05 06:57:15 +0000980
James Dennett1cc22032012-06-17 03:34:42 +0000981/// \verbatim
Richard Smithe18f0fa2012-03-05 04:02:15 +0000982/// user-defined-character-literal: [C++11 lex.ext]
983/// character-literal ud-suffix
984/// ud-suffix:
985/// identifier
986/// character-literal: [C++11 lex.ccon]
Craig Topper54edcca2011-08-11 04:06:15 +0000987/// ' c-char-sequence '
988/// u' c-char-sequence '
989/// U' c-char-sequence '
990/// L' c-char-sequence '
991/// c-char-sequence:
992/// c-char
993/// c-char-sequence c-char
994/// c-char:
995/// any member of the source character set except the single-quote ',
996/// backslash \, or new-line character
997/// escape-sequence
998/// universal-character-name
Richard Smithe18f0fa2012-03-05 04:02:15 +0000999/// escape-sequence:
Craig Topper54edcca2011-08-11 04:06:15 +00001000/// simple-escape-sequence
1001/// octal-escape-sequence
1002/// hexadecimal-escape-sequence
1003/// simple-escape-sequence:
NAKAMURA Takumi9f8a02d2011-08-12 05:49:51 +00001004/// one of \' \" \? \\ \a \b \f \n \r \t \v
Craig Topper54edcca2011-08-11 04:06:15 +00001005/// octal-escape-sequence:
1006/// \ octal-digit
1007/// \ octal-digit octal-digit
1008/// \ octal-digit octal-digit octal-digit
1009/// hexadecimal-escape-sequence:
1010/// \x hexadecimal-digit
1011/// hexadecimal-escape-sequence hexadecimal-digit
Richard Smithe18f0fa2012-03-05 04:02:15 +00001012/// universal-character-name: [C++11 lex.charset]
Craig Topper54edcca2011-08-11 04:06:15 +00001013/// \u hex-quad
1014/// \U hex-quad hex-quad
1015/// hex-quad:
1016/// hex-digit hex-digit hex-digit hex-digit
James Dennett1cc22032012-06-17 03:34:42 +00001017/// \endverbatim
Craig Topper54edcca2011-08-11 04:06:15 +00001018///
Chris Lattner2f5add62007-04-05 06:57:15 +00001019CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
Douglas Gregorfb65e592011-07-27 05:40:30 +00001020 SourceLocation Loc, Preprocessor &PP,
1021 tok::TokenKind kind) {
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001022 // At this point we know that the character matches the regex "(L|u|U)?'.*'".
Chris Lattner2f5add62007-04-05 06:57:15 +00001023 HadError = false;
Mike Stump11289f42009-09-09 15:08:12 +00001024
Douglas Gregorfb65e592011-07-27 05:40:30 +00001025 Kind = kind;
1026
Richard Smith2a70e652012-03-09 22:27:51 +00001027 const char *TokBegin = begin;
1028
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001029 // Skip over wide character determinant.
1030 if (Kind != tok::char_constant) {
Douglas Gregorfb65e592011-07-27 05:40:30 +00001031 ++begin;
1032 }
Mike Stump11289f42009-09-09 15:08:12 +00001033
Chris Lattner2f5add62007-04-05 06:57:15 +00001034 // Skip over the entry quote.
1035 assert(begin[0] == '\'' && "Invalid token lexed");
1036 ++begin;
1037
Richard Smithe18f0fa2012-03-05 04:02:15 +00001038 // Remove an optional ud-suffix.
1039 if (end[-1] != '\'') {
1040 const char *UDSuffixEnd = end;
1041 do {
1042 --end;
1043 } while (end[-1] != '\'');
Richard Smith8b7258b2014-02-17 21:52:30 +00001044 // FIXME: Don't bother with this if !tok.hasUCN().
1045 expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
Richard Smith2a70e652012-03-09 22:27:51 +00001046 UDSuffixOffset = end - TokBegin;
Richard Smithe18f0fa2012-03-05 04:02:15 +00001047 }
1048
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001049 // Trim the ending quote.
Richard Smithe18f0fa2012-03-05 04:02:15 +00001050 assert(end != begin && "Invalid token lexed");
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001051 --end;
1052
Mike Stump11289f42009-09-09 15:08:12 +00001053 // FIXME: The "Value" is an uint64_t so we can handle char literals of
Chris Lattner57540c52011-04-15 05:22:18 +00001054 // up to 64-bits.
Chris Lattner2f5add62007-04-05 06:57:15 +00001055 // FIXME: This extensively assumes that 'char' is 8-bits.
Chris Lattner37e05872008-03-05 18:54:05 +00001056 assert(PP.getTargetInfo().getCharWidth() == 8 &&
Chris Lattner2f5add62007-04-05 06:57:15 +00001057 "Assumes char is 8 bits");
Chris Lattner8577f622009-04-28 21:51:46 +00001058 assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1059 (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1060 "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1061 assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1062 "Assumes sizeof(wchar) on target is <= 64");
Sanjiv Guptaf09cb952009-04-21 02:21:29 +00001063
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001064 SmallVector<uint32_t, 4> codepoint_buffer;
1065 codepoint_buffer.resize(end - begin);
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001066 uint32_t *buffer_begin = &codepoint_buffer.front();
1067 uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
Mike Stump11289f42009-09-09 15:08:12 +00001068
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001069 // Unicode escapes representing characters that cannot be correctly
1070 // represented in a single code unit are disallowed in character literals
1071 // by this implementation.
1072 uint32_t largest_character_for_kind;
1073 if (tok::wide_char_constant == Kind) {
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001074 largest_character_for_kind =
Nick Lewycky8054f1d2013-08-21 18:57:51 +00001075 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001076 } else if (tok::utf16_char_constant == Kind) {
1077 largest_character_for_kind = 0xFFFF;
1078 } else if (tok::utf32_char_constant == Kind) {
1079 largest_character_for_kind = 0x10FFFF;
1080 } else {
1081 largest_character_for_kind = 0x7Fu;
Chris Lattner8577f622009-04-28 21:51:46 +00001082 }
1083
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001084 while (begin != end) {
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001085 // Is this a span of non-escape characters?
1086 if (begin[0] != '\\') {
1087 char const *start = begin;
1088 do {
1089 ++begin;
1090 } while (begin != end && *begin != '\\');
1091
Eli Friedman94363522012-02-11 05:08:10 +00001092 char const *tmp_in_start = start;
1093 uint32_t *tmp_out_start = buffer_begin;
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001094 ConversionResult res =
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001095 ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
1096 reinterpret_cast<UTF8 const *>(begin),
1097 &buffer_begin, buffer_end, strictConversion);
1098 if (res != conversionOK) {
1099 // If we see bad encoding for unprefixed character literals, warn and
1100 // simply copy the byte values, for compatibility with gcc and
Eli Friedman94363522012-02-11 05:08:10 +00001101 // older versions of clang.
1102 bool NoErrorOnBadEncoding = isAscii();
1103 unsigned Msg = diag::err_bad_character_encoding;
1104 if (NoErrorOnBadEncoding)
1105 Msg = diag::warn_bad_character_encoding;
Nick Lewycky8054f1d2013-08-21 18:57:51 +00001106 PP.Diag(Loc, Msg);
Eli Friedman94363522012-02-11 05:08:10 +00001107 if (NoErrorOnBadEncoding) {
1108 start = tmp_in_start;
1109 buffer_begin = tmp_out_start;
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001110 for (; start != begin; ++start, ++buffer_begin)
Eli Friedman94363522012-02-11 05:08:10 +00001111 *buffer_begin = static_cast<uint8_t>(*start);
1112 } else {
1113 HadError = true;
1114 }
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001115 } else {
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001116 for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
Eli Friedman94363522012-02-11 05:08:10 +00001117 if (*tmp_out_start > largest_character_for_kind) {
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001118 HadError = true;
1119 PP.Diag(Loc, diag::err_character_too_large);
1120 }
1121 }
1122 }
1123
1124 continue;
1125 }
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001126 // Is this a Universal Character Name escape?
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001127 if (begin[1] == 'u' || begin[1] == 'U') {
1128 unsigned short UcnLen = 0;
Richard Smith2a70e652012-03-09 22:27:51 +00001129 if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001130 FullSourceLoc(Loc, PP.getSourceManager()),
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001131 &PP.getDiagnostics(), PP.getLangOpts(), true)) {
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001132 HadError = true;
1133 } else if (*buffer_begin > largest_character_for_kind) {
1134 HadError = true;
Richard Smith639b8d02012-09-08 07:16:20 +00001135 PP.Diag(Loc, diag::err_character_too_large);
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001136 }
1137
1138 ++buffer_begin;
1139 continue;
1140 }
1141 unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1142 uint64_t result =
Richard Smith639b8d02012-09-08 07:16:20 +00001143 ProcessCharEscape(TokBegin, begin, end, HadError,
Nick Lewycky8054f1d2013-08-21 18:57:51 +00001144 FullSourceLoc(Loc,PP.getSourceManager()),
Richard Smith639b8d02012-09-08 07:16:20 +00001145 CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001146 *buffer_begin++ = result;
1147 }
1148
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001149 unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001150
Chris Lattner8577f622009-04-28 21:51:46 +00001151 if (NumCharsSoFar > 1) {
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001152 if (isWide())
Douglas Gregorfb65e592011-07-27 05:40:30 +00001153 PP.Diag(Loc, diag::warn_extraneous_char_constant);
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001154 else if (isAscii() && NumCharsSoFar == 4)
1155 PP.Diag(Loc, diag::ext_four_char_character_literal);
1156 else if (isAscii())
Chris Lattner8577f622009-04-28 21:51:46 +00001157 PP.Diag(Loc, diag::ext_multichar_character_literal);
1158 else
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001159 PP.Diag(Loc, diag::err_multichar_utf_character_literal);
Eli Friedmand8cec572009-06-01 05:25:02 +00001160 IsMultiChar = true;
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001161 } else {
Daniel Dunbara444cc22009-07-29 01:46:05 +00001162 IsMultiChar = false;
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001163 }
Sanjiv Guptaf09cb952009-04-21 02:21:29 +00001164
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001165 llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1166
1167 // Narrow character literals act as though their value is concatenated
1168 // in this implementation, but warn on overflow.
1169 bool multi_char_too_long = false;
1170 if (isAscii() && isMultiChar()) {
1171 LitVal = 0;
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001172 for (size_t i = 0; i < NumCharsSoFar; ++i) {
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001173 // check for enough leading zeros to shift into
1174 multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1175 LitVal <<= 8;
1176 LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1177 }
1178 } else if (NumCharsSoFar > 0) {
1179 // otherwise just take the last character
1180 LitVal = buffer_begin[-1];
1181 }
1182
1183 if (!HadError && multi_char_too_long) {
Nick Lewycky63cc55b2013-08-21 02:40:19 +00001184 PP.Diag(Loc, diag::warn_char_constant_too_large);
Seth Cantrell8b2b6772012-01-18 12:27:04 +00001185 }
1186
Sanjiv Guptaf09cb952009-04-21 02:21:29 +00001187 // Transfer the value from APInt to uint64_t
1188 Value = LitVal.getZExtValue();
Mike Stump11289f42009-09-09 15:08:12 +00001189
Chris Lattner2f5add62007-04-05 06:57:15 +00001190 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1191 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
1192 // character constants are not sign extended in the this implementation:
1193 // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
Douglas Gregorfb65e592011-07-27 05:40:30 +00001194 if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
David Blaikiebbafb8a2012-03-11 07:00:24 +00001195 PP.getLangOpts().CharIsSigned)
Chris Lattner2f5add62007-04-05 06:57:15 +00001196 Value = (signed char)Value;
1197}
1198
James Dennett99c193b2012-06-19 21:04:25 +00001199/// \verbatim
Craig Topper54edcca2011-08-11 04:06:15 +00001200/// string-literal: [C++0x lex.string]
1201/// encoding-prefix " [s-char-sequence] "
1202/// encoding-prefix R raw-string
///       encoding-prefix:
///         u8
///         u
///         U
///         L
///       s-char-sequence:
///         s-char
///         s-char-sequence s-char
///       s-char:
///         any member of the source character set except the double-quote ",
///           backslash \, or new-line character
///         escape-sequence
///         universal-character-name
///       raw-string:
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
///       r-char-sequence:
///         r-char
///         r-char-sequence r-char
///       r-char:
///         any member of the source character set, except a right parenthesis )
///           followed by the initial d-char-sequence (which may be empty)
///           followed by a double quote ".
///       d-char-sequence:
///         d-char
///         d-char-sequence d-char
///       d-char:
///         any member of the basic source character set except:
///           space, the left parenthesis (, the right parenthesis ),
///           the backslash \, and the control characters representing horizontal
///           tab, vertical tab, form feed, and newline.
///       escape-sequence: [C++0x lex.ccon]
///         simple-escape-sequence
///         octal-escape-sequence
///         hexadecimal-escape-sequence
///       simple-escape-sequence:
///         one of \' \" \? \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name:
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
Steve Naroff4f88b312007-03-13 22:37:02 +00001253StringLiteralParser::
Chris Lattner146762e2007-07-20 16:59:19 +00001254StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
Chris Lattner6bab4352010-11-17 07:21:13 +00001255 Preprocessor &PP, bool Complain)
David Blaikiebbafb8a2012-03-11 07:00:24 +00001256 : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Argyrios Kyrtzidis8b7252a2011-05-17 22:09:56 +00001257 Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
Douglas Gregorfb65e592011-07-27 05:40:30 +00001258 MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1259 ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
Chris Lattner6bab4352010-11-17 07:21:13 +00001260 init(StringToks, NumStringToks);
1261}
1262
1263void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
Argyrios Kyrtzidis8b7252a2011-05-17 22:09:56 +00001264 // The literal token may have come from an invalid source location (e.g. due
1265 // to a PCH error), in which case the token length will be 0.
Argyrios Kyrtzidis9933e3a2012-05-03 17:50:32 +00001266 if (NumStringToks == 0 || StringToks[0].getLength() < 2)
1267 return DiagnoseLexingError(SourceLocation());
Argyrios Kyrtzidis8b7252a2011-05-17 22:09:56 +00001268
Steve Naroff4f88b312007-03-13 22:37:02 +00001269 // Scan all of the string portions, remember the max individual token length,
1270 // computing a bound on the concatenated string length, and see whether any
1271 // piece is a wide-string. If any of the string portions is a wide-string
1272 // literal, the result is a wide-string literal [C99 6.4.5p4].
Argyrios Kyrtzidis8b7252a2011-05-17 22:09:56 +00001273 assert(NumStringToks && "expected at least one token");
Alexis Hunt3b791862010-08-30 17:47:05 +00001274 MaxTokenLength = StringToks[0].getLength();
Argyrios Kyrtzidis8b7252a2011-05-17 22:09:56 +00001275 assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
Alexis Hunt3b791862010-08-30 17:47:05 +00001276 SizeBound = StringToks[0].getLength()-2; // -2 for "".
Douglas Gregorfb65e592011-07-27 05:40:30 +00001277 Kind = StringToks[0].getKind();
Alexis Hunt3b791862010-08-30 17:47:05 +00001278
1279 hadError = false;
Chris Lattner2f5add62007-04-05 06:57:15 +00001280
1281 // Implement Translation Phase #6: concatenation of string literals
1282 /// (C99 5.1.1.2p1). The common case is only one string fragment.
Steve Naroff4f88b312007-03-13 22:37:02 +00001283 for (unsigned i = 1; i != NumStringToks; ++i) {
Argyrios Kyrtzidis9933e3a2012-05-03 17:50:32 +00001284 if (StringToks[i].getLength() < 2)
1285 return DiagnoseLexingError(StringToks[i].getLocation());
Argyrios Kyrtzidis8b7252a2011-05-17 22:09:56 +00001286
Steve Naroff4f88b312007-03-13 22:37:02 +00001287 // The string could be shorter than this if it needs cleaning, but this is a
1288 // reasonable bound, which is all we need.
Argyrios Kyrtzidis8b7252a2011-05-17 22:09:56 +00001289 assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
Alexis Hunt3b791862010-08-30 17:47:05 +00001290 SizeBound += StringToks[i].getLength()-2; // -2 for "".
Mike Stump11289f42009-09-09 15:08:12 +00001291
Steve Naroff4f88b312007-03-13 22:37:02 +00001292 // Remember maximum string piece length.
Alexis Hunt3b791862010-08-30 17:47:05 +00001293 if (StringToks[i].getLength() > MaxTokenLength)
1294 MaxTokenLength = StringToks[i].getLength();
Mike Stump11289f42009-09-09 15:08:12 +00001295
Douglas Gregorfb65e592011-07-27 05:40:30 +00001296 // Remember if we see any wide or utf-8/16/32 strings.
1297 // Also check for illegal concatenations.
1298 if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1299 if (isAscii()) {
1300 Kind = StringToks[i].getKind();
1301 } else {
1302 if (Diags)
Richard Smith639b8d02012-09-08 07:16:20 +00001303 Diags->Report(StringToks[i].getLocation(),
Douglas Gregorfb65e592011-07-27 05:40:30 +00001304 diag::err_unsupported_string_concat);
1305 hadError = true;
1306 }
1307 }
Steve Naroff4f88b312007-03-13 22:37:02 +00001308 }
Chris Lattnerd42c29f2009-02-26 23:01:51 +00001309
Steve Naroff4f88b312007-03-13 22:37:02 +00001310 // Include space for the null terminator.
1311 ++SizeBound;
Mike Stump11289f42009-09-09 15:08:12 +00001312
Steve Naroff4f88b312007-03-13 22:37:02 +00001313 // TODO: K&R warning: "traditional C rejects string constant concatenation"
Mike Stump11289f42009-09-09 15:08:12 +00001314
Douglas Gregorfb65e592011-07-27 05:40:30 +00001315 // Get the width in bytes of char/wchar_t/char16_t/char32_t
1316 CharByteWidth = getCharWidth(Kind, Target);
1317 assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1318 CharByteWidth /= 8;
Mike Stump11289f42009-09-09 15:08:12 +00001319
Steve Naroff4f88b312007-03-13 22:37:02 +00001320 // The output buffer size needs to be large enough to hold wide characters.
1321 // This is a worst-case assumption which basically corresponds to L"" "long".
Douglas Gregorfb65e592011-07-27 05:40:30 +00001322 SizeBound *= CharByteWidth;
Mike Stump11289f42009-09-09 15:08:12 +00001323
Steve Naroff4f88b312007-03-13 22:37:02 +00001324 // Size the temporary buffer to hold the result string data.
1325 ResultBuf.resize(SizeBound);
Mike Stump11289f42009-09-09 15:08:12 +00001326
Steve Naroff4f88b312007-03-13 22:37:02 +00001327 // Likewise, but for each string piece.
Dylan Noblesmith2c1dd272012-02-05 02:13:05 +00001328 SmallString<512> TokenBuf;
Steve Naroff4f88b312007-03-13 22:37:02 +00001329 TokenBuf.resize(MaxTokenLength);
Mike Stump11289f42009-09-09 15:08:12 +00001330
Steve Naroff4f88b312007-03-13 22:37:02 +00001331 // Loop over all the strings, getting their spelling, and expanding them to
1332 // wide strings as appropriate.
1333 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
Mike Stump11289f42009-09-09 15:08:12 +00001334
Anders Carlssoncbfc4b82007-10-15 02:50:23 +00001335 Pascal = false;
Mike Stump11289f42009-09-09 15:08:12 +00001336
Richard Smithe18f0fa2012-03-05 04:02:15 +00001337 SourceLocation UDSuffixTokLoc;
1338
Steve Naroff4f88b312007-03-13 22:37:02 +00001339 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
1340 const char *ThisTokBuf = &TokenBuf[0];
1341 // Get the spelling of the token, which eliminates trigraphs, etc. We know
1342 // that ThisTokBuf points to a buffer that is big enough for the whole token
1343 // and 'spelled' tokens can only shrink.
Douglas Gregor7bda4b82010-03-16 05:20:39 +00001344 bool StringInvalid = false;
Chris Lattner6bab4352010-11-17 07:21:13 +00001345 unsigned ThisTokLen =
Chris Lattner39720112010-11-17 07:26:20 +00001346 Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1347 &StringInvalid);
Argyrios Kyrtzidis9933e3a2012-05-03 17:50:32 +00001348 if (StringInvalid)
1349 return DiagnoseLexingError(StringToks[i].getLocation());
Douglas Gregor7bda4b82010-03-16 05:20:39 +00001350
Richard Smith2a70e652012-03-09 22:27:51 +00001351 const char *ThisTokBegin = ThisTokBuf;
Richard Smithe18f0fa2012-03-05 04:02:15 +00001352 const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1353
1354 // Remove an optional ud-suffix.
1355 if (ThisTokEnd[-1] != '"') {
1356 const char *UDSuffixEnd = ThisTokEnd;
1357 do {
1358 --ThisTokEnd;
1359 } while (ThisTokEnd[-1] != '"');
1360
1361 StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1362
1363 if (UDSuffixBuf.empty()) {
Richard Smith8b7258b2014-02-17 21:52:30 +00001364 if (StringToks[i].hasUCN())
1365 expandUCNs(UDSuffixBuf, UDSuffix);
1366 else
1367 UDSuffixBuf.assign(UDSuffix);
Richard Smith75b67d62012-03-08 01:34:56 +00001368 UDSuffixToken = i;
1369 UDSuffixOffset = ThisTokEnd - ThisTokBuf;
Richard Smithe18f0fa2012-03-05 04:02:15 +00001370 UDSuffixTokLoc = StringToks[i].getLocation();
Richard Smith8b7258b2014-02-17 21:52:30 +00001371 } else {
1372 SmallString<32> ExpandedUDSuffix;
1373 if (StringToks[i].hasUCN()) {
1374 expandUCNs(ExpandedUDSuffix, UDSuffix);
1375 UDSuffix = ExpandedUDSuffix;
1376 }
1377
Richard Smithe18f0fa2012-03-05 04:02:15 +00001378 // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1379 // result of a concatenation involving at least one user-defined-string-
1380 // literal, all the participating user-defined-string-literals shall
1381 // have the same ud-suffix.
David Blaikiedcb72d72014-03-09 05:18:27 +00001382 if (UDSuffixBuf != UDSuffix) {
Richard Smith8b7258b2014-02-17 21:52:30 +00001383 if (Diags) {
1384 SourceLocation TokLoc = StringToks[i].getLocation();
1385 Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1386 << UDSuffixBuf << UDSuffix
1387 << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1388 << SourceRange(TokLoc, TokLoc);
1389 }
1390 hadError = true;
Richard Smithe18f0fa2012-03-05 04:02:15 +00001391 }
Richard Smithe18f0fa2012-03-05 04:02:15 +00001392 }
1393 }
1394
1395 // Strip the end quote.
1396 --ThisTokEnd;
1397
Steve Naroff4f88b312007-03-13 22:37:02 +00001398 // TODO: Input character set mapping support.
Mike Stump11289f42009-09-09 15:08:12 +00001399
Craig Topper61147ed2011-08-08 06:10:39 +00001400 // Skip marker for wide or unicode strings.
Douglas Gregorfb65e592011-07-27 05:40:30 +00001401 if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
Chris Lattnerc10adde2007-05-20 05:00:58 +00001402 ++ThisTokBuf;
Douglas Gregorfb65e592011-07-27 05:40:30 +00001403 // Skip 8 of u8 marker for utf8 strings.
1404 if (ThisTokBuf[0] == '8')
1405 ++ThisTokBuf;
Fariborz Jahanianabaae2b2010-08-31 23:34:27 +00001406 }
Mike Stump11289f42009-09-09 15:08:12 +00001407
Craig Topper54edcca2011-08-11 04:06:15 +00001408 // Check for raw string
1409 if (ThisTokBuf[0] == 'R') {
1410 ThisTokBuf += 2; // skip R"
Mike Stump11289f42009-09-09 15:08:12 +00001411
Craig Topper54edcca2011-08-11 04:06:15 +00001412 const char *Prefix = ThisTokBuf;
1413 while (ThisTokBuf[0] != '(')
Anders Carlssoncbfc4b82007-10-15 02:50:23 +00001414 ++ThisTokBuf;
Craig Topper54edcca2011-08-11 04:06:15 +00001415 ++ThisTokBuf; // skip '('
Mike Stump11289f42009-09-09 15:08:12 +00001416
Richard Smith81292452012-03-08 21:59:28 +00001417 // Remove same number of characters from the end
1418 ThisTokEnd -= ThisTokBuf - Prefix;
1419 assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
Craig Topper54edcca2011-08-11 04:06:15 +00001420
1421 // Copy the string over
Richard Smith639b8d02012-09-08 07:16:20 +00001422 if (CopyStringFragment(StringToks[i], ThisTokBegin,
1423 StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
1424 hadError = true;
Craig Topper54edcca2011-08-11 04:06:15 +00001425 } else {
Argyrios Kyrtzidis4e5b5c32012-05-03 01:01:56 +00001426 if (ThisTokBuf[0] != '"') {
1427 // The file may have come from PCH and then changed after loading the
1428 // PCH; Fail gracefully.
Argyrios Kyrtzidis9933e3a2012-05-03 17:50:32 +00001429 return DiagnoseLexingError(StringToks[i].getLocation());
Argyrios Kyrtzidis4e5b5c32012-05-03 01:01:56 +00001430 }
Craig Topper54edcca2011-08-11 04:06:15 +00001431 ++ThisTokBuf; // skip "
1432
1433 // Check if this is a pascal string
1434 if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1435 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1436
1437 // If the \p sequence is found in the first token, we have a pascal string
1438 // Otherwise, if we already have a pascal string, ignore the first \p
1439 if (i == 0) {
Steve Naroff4f88b312007-03-13 22:37:02 +00001440 ++ThisTokBuf;
Craig Topper54edcca2011-08-11 04:06:15 +00001441 Pascal = true;
1442 } else if (Pascal)
1443 ThisTokBuf += 2;
1444 }
Mike Stump11289f42009-09-09 15:08:12 +00001445
Craig Topper54edcca2011-08-11 04:06:15 +00001446 while (ThisTokBuf != ThisTokEnd) {
1447 // Is this a span of non-escape characters?
1448 if (ThisTokBuf[0] != '\\') {
1449 const char *InStart = ThisTokBuf;
1450 do {
1451 ++ThisTokBuf;
1452 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1453
1454 // Copy the character span over.
Richard Smith639b8d02012-09-08 07:16:20 +00001455 if (CopyStringFragment(StringToks[i], ThisTokBegin,
1456 StringRef(InStart, ThisTokBuf - InStart)))
1457 hadError = true;
Craig Topper54edcca2011-08-11 04:06:15 +00001458 continue;
Steve Naroff4f88b312007-03-13 22:37:02 +00001459 }
Craig Topper54edcca2011-08-11 04:06:15 +00001460 // Is this a Universal Character Name escape?
1461 if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
Richard Smith2a70e652012-03-09 22:27:51 +00001462 EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1463 ResultPtr, hadError,
1464 FullSourceLoc(StringToks[i].getLocation(), SM),
Craig Topper54edcca2011-08-11 04:06:15 +00001465 CharByteWidth, Diags, Features);
1466 continue;
1467 }
1468 // Otherwise, this is a non-UCN escape character. Process it.
1469 unsigned ResultChar =
Richard Smith639b8d02012-09-08 07:16:20 +00001470 ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
Craig Topper54edcca2011-08-11 04:06:15 +00001471 FullSourceLoc(StringToks[i].getLocation(), SM),
Richard Smith639b8d02012-09-08 07:16:20 +00001472 CharByteWidth*8, Diags, Features);
Mike Stump11289f42009-09-09 15:08:12 +00001473
Eli Friedmand1370792011-11-02 23:06:23 +00001474 if (CharByteWidth == 4) {
1475 // FIXME: Make the type of the result buffer correct instead of
1476 // using reinterpret_cast.
1477 UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
Nico Weberd60b72f2011-11-14 05:17:37 +00001478 *ResultWidePtr = ResultChar;
Eli Friedmand1370792011-11-02 23:06:23 +00001479 ResultPtr += 4;
1480 } else if (CharByteWidth == 2) {
1481 // FIXME: Make the type of the result buffer correct instead of
1482 // using reinterpret_cast.
1483 UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
Nico Weberd60b72f2011-11-14 05:17:37 +00001484 *ResultWidePtr = ResultChar & 0xFFFF;
Eli Friedmand1370792011-11-02 23:06:23 +00001485 ResultPtr += 2;
1486 } else {
1487 assert(CharByteWidth == 1 && "Unexpected char width");
1488 *ResultPtr++ = ResultChar & 0xFF;
1489 }
Craig Topper54edcca2011-08-11 04:06:15 +00001490 }
Steve Naroff4f88b312007-03-13 22:37:02 +00001491 }
1492 }
Mike Stump11289f42009-09-09 15:08:12 +00001493
Chris Lattner8a24e582009-01-16 18:51:42 +00001494 if (Pascal) {
Eli Friedman20554702011-11-05 00:41:04 +00001495 if (CharByteWidth == 4) {
1496 // FIXME: Make the type of the result buffer correct instead of
1497 // using reinterpret_cast.
1498 UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
1499 ResultWidePtr[0] = GetNumStringChars() - 1;
1500 } else if (CharByteWidth == 2) {
1501 // FIXME: Make the type of the result buffer correct instead of
1502 // using reinterpret_cast.
1503 UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
1504 ResultWidePtr[0] = GetNumStringChars() - 1;
1505 } else {
1506 assert(CharByteWidth == 1 && "Unexpected char width");
1507 ResultBuf[0] = GetNumStringChars() - 1;
1508 }
Chris Lattner8a24e582009-01-16 18:51:42 +00001509
1510 // Verify that pascal strings aren't too large.
Chris Lattner6bab4352010-11-17 07:21:13 +00001511 if (GetStringLength() > 256) {
Richard Smith639b8d02012-09-08 07:16:20 +00001512 if (Diags)
1513 Diags->Report(StringToks[0].getLocation(),
Chris Lattner6bab4352010-11-17 07:21:13 +00001514 diag::err_pascal_string_too_long)
1515 << SourceRange(StringToks[0].getLocation(),
1516 StringToks[NumStringToks-1].getLocation());
Douglas Gregorfb65e592011-07-27 05:40:30 +00001517 hadError = true;
Eli Friedman1c3fb222009-04-01 03:17:08 +00001518 return;
1519 }
Chris Lattner6bab4352010-11-17 07:21:13 +00001520 } else if (Diags) {
Douglas Gregorb37b46e2010-07-20 14:33:20 +00001521 // Complain if this string literal has too many characters.
Chris Lattner2be8aa92010-11-17 07:12:42 +00001522 unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
Benjamin Kramerf23a6e62012-11-08 19:22:26 +00001523
Douglas Gregorb37b46e2010-07-20 14:33:20 +00001524 if (GetNumStringChars() > MaxChars)
Richard Smith639b8d02012-09-08 07:16:20 +00001525 Diags->Report(StringToks[0].getLocation(),
Chris Lattner6bab4352010-11-17 07:21:13 +00001526 diag::ext_string_too_long)
Douglas Gregorb37b46e2010-07-20 14:33:20 +00001527 << GetNumStringChars() << MaxChars
Chris Lattner2be8aa92010-11-17 07:12:42 +00001528 << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
Douglas Gregorb37b46e2010-07-20 14:33:20 +00001529 << SourceRange(StringToks[0].getLocation(),
1530 StringToks[NumStringToks-1].getLocation());
Chris Lattner8a24e582009-01-16 18:51:42 +00001531 }
Steve Naroff4f88b312007-03-13 22:37:02 +00001532}
Chris Lattnerddb71912009-02-18 19:21:10 +00001533
Benjamin Kramerf23a6e62012-11-08 19:22:26 +00001534static const char *resyncUTF8(const char *Err, const char *End) {
1535 if (Err == End)
1536 return End;
1537 End = Err + std::min<unsigned>(getNumBytesForUTF8(*Err), End-Err);
1538 while (++Err != End && (*Err & 0xC0) == 0x80)
1539 ;
1540 return Err;
Seth Cantrell4cfc8172012-10-28 18:24:46 +00001541}
1542
Richard Smith639b8d02012-09-08 07:16:20 +00001543/// \brief This function copies from Fragment, which is a sequence of bytes
1544/// within Tok's contents (which begin at TokBegin) into ResultPtr.
Craig Topper54edcca2011-08-11 04:06:15 +00001545/// Performs widening for multi-byte characters.
Richard Smith639b8d02012-09-08 07:16:20 +00001546bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1547 const char *TokBegin,
1548 StringRef Fragment) {
1549 const UTF8 *ErrorPtrTmp;
1550 if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1551 return false;
Craig Topper54edcca2011-08-11 04:06:15 +00001552
Eli Friedman94363522012-02-11 05:08:10 +00001553 // If we see bad encoding for unprefixed string literals, warn and
1554 // simply copy the byte values, for compatibility with gcc and older
1555 // versions of clang.
1556 bool NoErrorOnBadEncoding = isAscii();
Richard Smith639b8d02012-09-08 07:16:20 +00001557 if (NoErrorOnBadEncoding) {
1558 memcpy(ResultPtr, Fragment.data(), Fragment.size());
1559 ResultPtr += Fragment.size();
1560 }
Seth Cantrell4cfc8172012-10-28 18:24:46 +00001561
Richard Smith639b8d02012-09-08 07:16:20 +00001562 if (Diags) {
Seth Cantrell4cfc8172012-10-28 18:24:46 +00001563 const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1564
1565 FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1566 const DiagnosticBuilder &Builder =
1567 Diag(Diags, Features, SourceLoc, TokBegin,
Benjamin Kramerf23a6e62012-11-08 19:22:26 +00001568 ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
Seth Cantrell4cfc8172012-10-28 18:24:46 +00001569 NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1570 : diag::err_bad_string_encoding);
1571
Benjamin Kramerf23a6e62012-11-08 19:22:26 +00001572 const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
Seth Cantrell4cfc8172012-10-28 18:24:46 +00001573 StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1574
Benjamin Kramer7d574e22012-11-08 19:22:31 +00001575 // Decode into a dummy buffer.
1576 SmallString<512> Dummy;
1577 Dummy.reserve(Fragment.size() * CharByteWidth);
1578 char *Ptr = Dummy.data();
1579
David Blaikiea0613172012-10-30 23:22:22 +00001580 while (!Builder.hasMaxRanges() &&
Benjamin Kramer7d574e22012-11-08 19:22:31 +00001581 !ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
Seth Cantrell4cfc8172012-10-28 18:24:46 +00001582 const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
Benjamin Kramerf23a6e62012-11-08 19:22:26 +00001583 NextStart = resyncUTF8(ErrorPtr, Fragment.end());
Seth Cantrell4cfc8172012-10-28 18:24:46 +00001584 Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1585 ErrorPtr, NextStart);
1586 NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1587 }
Richard Smith639b8d02012-09-08 07:16:20 +00001588 }
Eli Friedman94363522012-02-11 05:08:10 +00001589 return !NoErrorOnBadEncoding;
1590}
Craig Topper54edcca2011-08-11 04:06:15 +00001591
Argyrios Kyrtzidis9933e3a2012-05-03 17:50:32 +00001592void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1593 hadError = true;
1594 if (Diags)
1595 Diags->Report(Loc, diag::err_lexing_string);
1596}
1597
Chris Lattnerddb71912009-02-18 19:21:10 +00001598/// getOffsetOfStringByte - This function returns the offset of the
1599/// specified byte of the string data represented by Token. This handles
1600/// advancing over escape sequences in the string.
1601unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
Chris Lattnerbde1b812010-11-17 06:46:14 +00001602 unsigned ByteNo) const {
Chris Lattnerddb71912009-02-18 19:21:10 +00001603 // Get the spelling of the token.
Dylan Noblesmith2c1dd272012-02-05 02:13:05 +00001604 SmallString<32> SpellingBuffer;
Alexis Hunt3b791862010-08-30 17:47:05 +00001605 SpellingBuffer.resize(Tok.getLength());
Mike Stump11289f42009-09-09 15:08:12 +00001606
Douglas Gregor7bda4b82010-03-16 05:20:39 +00001607 bool StringInvalid = false;
Chris Lattnerddb71912009-02-18 19:21:10 +00001608 const char *SpellingPtr = &SpellingBuffer[0];
Chris Lattner39720112010-11-17 07:26:20 +00001609 unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1610 &StringInvalid);
Chris Lattner7a02bfd2010-11-17 06:26:08 +00001611 if (StringInvalid)
Douglas Gregor7bda4b82010-03-16 05:20:39 +00001612 return 0;
Chris Lattnerddb71912009-02-18 19:21:10 +00001613
Chris Lattnerddb71912009-02-18 19:21:10 +00001614 const char *SpellingStart = SpellingPtr;
1615 const char *SpellingEnd = SpellingPtr+TokLen;
1616
Richard Smith4060f772012-06-13 05:37:23 +00001617 // Handle UTF-8 strings just like narrow strings.
1618 if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1619 SpellingPtr += 2;
1620
1621 assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1622 SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1623
1624 // For raw string literals, this is easy.
1625 if (SpellingPtr[0] == 'R') {
1626 assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1627 // Skip 'R"'.
1628 SpellingPtr += 2;
1629 while (*SpellingPtr != '(') {
1630 ++SpellingPtr;
1631 assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1632 }
1633 // Skip '('.
1634 ++SpellingPtr;
1635 return SpellingPtr - SpellingStart + ByteNo;
1636 }
1637
1638 // Skip over the leading quote
Chris Lattnerddb71912009-02-18 19:21:10 +00001639 assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1640 ++SpellingPtr;
Mike Stump11289f42009-09-09 15:08:12 +00001641
Chris Lattnerddb71912009-02-18 19:21:10 +00001642 // Skip over bytes until we find the offset we're looking for.
1643 while (ByteNo) {
1644 assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
Mike Stump11289f42009-09-09 15:08:12 +00001645
Chris Lattnerddb71912009-02-18 19:21:10 +00001646 // Step over non-escapes simply.
1647 if (*SpellingPtr != '\\') {
1648 ++SpellingPtr;
1649 --ByteNo;
1650 continue;
1651 }
Mike Stump11289f42009-09-09 15:08:12 +00001652
Chris Lattnerddb71912009-02-18 19:21:10 +00001653 // Otherwise, this is an escape character. Advance over it.
1654 bool HadError = false;
Richard Smith4060f772012-06-13 05:37:23 +00001655 if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
1656 const char *EscapePtr = SpellingPtr;
1657 unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1658 1, Features, HadError);
1659 if (Len > ByteNo) {
1660 // ByteNo is somewhere within the escape sequence.
1661 SpellingPtr = EscapePtr;
1662 break;
1663 }
1664 ByteNo -= Len;
1665 } else {
Richard Smith639b8d02012-09-08 07:16:20 +00001666 ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
Richard Smith4060f772012-06-13 05:37:23 +00001667 FullSourceLoc(Tok.getLocation(), SM),
Richard Smith639b8d02012-09-08 07:16:20 +00001668 CharByteWidth*8, Diags, Features);
Richard Smith4060f772012-06-13 05:37:23 +00001669 --ByteNo;
1670 }
Chris Lattnerddb71912009-02-18 19:21:10 +00001671 assert(!HadError && "This method isn't valid on erroneous strings");
Chris Lattnerddb71912009-02-18 19:21:10 +00001672 }
Mike Stump11289f42009-09-09 15:08:12 +00001673
Chris Lattnerddb71912009-02-18 19:21:10 +00001674 return SpellingPtr-SpellingStart;
1675}